In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from io import StringIO

import pandas as pd
from llama_index.core.workflow import Context, JsonSerializer

from charles_dicken_qa_chatbot.constants import (
    OPIK_BASE_URL,
    OPIK_PROJ_NAME,
    LLM_MODEL,
    COLLECTION_NAME,
    QDRANT_HOST,
    QDRANT_PORT,
    REDIS_HOST,
    REDIS_PORT,
)
from charles_dicken_qa_chatbot.workflow import RAGFlow

## Initialization

In [None]:
# Gather every constructor argument in one mapping so the configuration can be
# inspected before the workflow is built. Insertion order matches the keyword
# order used when calling RAGFlow.
ragflow_settings = {
    "opik_host": OPIK_BASE_URL,
    "opik_project_name": OPIK_PROJ_NAME,
    "llm_model_name": LLM_MODEL,
    "collection_name": COLLECTION_NAME,
    "qdrant_host": QDRANT_HOST,
    "qdrant_port": QDRANT_PORT,
    "redis_host": REDIS_HOST,
    "redis_port": REDIS_PORT,
    # NOTE(review): presumably None disables the workflow timeout — confirm
    # against RAGFlow / the llama_index Workflow base class.
    "timeout": None,
}
workflow = RAGFlow(**ragflow_settings)

# Context object passed as `ctx=` to the workflow runs in the cells below.
ctx = Context(workflow)

## Ingestion

The following cell assumes that Qdrant already contains data (i.e. Qdrant was preloaded from the backup snapshot).

In [None]:
# Use default preloaded snapshot in Qdrant.
# The result is kept in `nodes`, which the (commented-out) data-generation
# cells further down reference as `qa_nodes=nodes` / `opik_nodes=nodes`.
nodes = await workflow.run(from_default=True, ctx=ctx)
# Second run with `initialize_ctx=True` on the same context.
# NOTE(review): presumably this prepares `ctx` for the later evaluation and
# query runs — confirm against RAGFlow. Its return value is the cell output.
await workflow.run(initialize_ctx=True, ctx=ctx)

The following cell populates the Qdrant database from scratch, using the Gutenberg IDs stored in a CSV file.

In [None]:
## Load documents from source path
# source_path = "../data/dickens_books.csv"
# nodes = await workflow.run(source_path=source_path, ctx=ctx)

## Evaluation

- Because the dataset is large, it is best to keep the `sample_percentage` variable low so that the run does not take too long. The trade-off is that a small sample might not fully reflect the diversity of the content in the evaluation dataset.
- Async execution can also be finicky, so timeouts and other errors may come up often. It is best to persist the data for backup and future reference.

### Retriever evaluation

After ingesting the data, you can run a retriever evaluation in either of the two ways below.

In [None]:
# Load dataset generated from file
await workflow.run(
    qa_json_load_path="qa_dataset.json",
    ctx=ctx,
)

retriever_results_table = await ctx.store.get("retriever_results_table")
pd.read_json(retriever_results_table)

In [None]:
# Run data generation from scratch
# This will also create a json file called "qa_dataset.json"
# upon completion of synthetic data generation
# await workflow.run(
#     qa_nodes=nodes,
#     similarity_top_k=3,
#     num_questions_per_chunk=1,
#     sample_percentage=0.05,
#     ctx=ctx,
# )

# retriever_results_table = await ctx.store.get("retriever_results_table")
# pd.read_json(retriever_results_table)

### Response evaluation

Opik endpoint for datasets: http://localhost:5173/default/datasets

In [None]:
# Generate synthetic data for the chunks, upload the resulting dataset to Opik,
# and then run response evaluation on it.
# The dataset is called "gpt-5-nano-charles_dickens-eval-YYYY-MM-DD"
# response_eval_result = await workflow.run(
#     opik_nodes=nodes,
#     num_questions_per_chunk=1,
#     sample_percentage=0.05,
#     opik=True,
#     ctx=ctx,
# )

In [None]:
# If the dataset is already uploaded to Opik then the response evaluation
# can be run without synthetic data generation

# Name of the existing Opik dataset to evaluate against; it follows the
# "gpt-5-nano-charles_dickens-eval-YYYY-MM-DD" naming pattern.
opik_dataset_name = "gpt-5-nano-charles_dickens-eval-2025-10-01"

# NOTE(review): `num_questions_per_chunk` and `sample_percentage` are passed
# even though no data generation is requested here — confirm whether RAGFlow
# still uses them when `opik_dataset_name` is given.
response_eval_result = await workflow.run(
    opik_dataset_name=opik_dataset_name,
    num_questions_per_chunk=1,
    sample_percentage=0.05,
    opik=True,
    ctx=ctx,
)

In [None]:
response = await workflow.run(
    query="What is 'Great Expectation' novel's main theme?", ctx=ctx
)

In [None]:
# Display the `response` attribute of the result returned by the query run.
response.response

### Save states

In [None]:
import redis
import json

In [None]:
# Client for the Redis instance that stores the serialized workflow context.
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    db=0,
    # Return str instead of bytes so the stored value can go straight to json.loads.
    decode_responses=True,
)

In [None]:
# Serialize the workflow context to a plain dict using the JSON serializer,
# so it can be dumped with `json.dumps` and stored externally.
ctx_dict = ctx.to_dict(serializer=JsonSerializer())

In [None]:
# Persist the serialized context in Redis under the "ctx" key; the return
# value of `set` is shown as the cell output.
redis_client.set("ctx", json.dumps(ctx_dict))

### Load saved states

In [None]:
# Fetch the serialized context stored under the "ctx" key.
serialized_ctx = redis_client.get("ctx")
if serialized_ctx is None:
    # A missing key comes back as None; json.loads(None) would otherwise fail
    # with an unhelpful TypeError, so fail with an actionable message instead.
    raise RuntimeError(
        'No saved context found in Redis under key "ctx"; '
        'run the "Save states" cells first.'
    )

# Decode the JSON payload back into the dict expected by Context.from_dict.
loaded_ctx_dict = json.loads(serialized_ctx)

In [None]:
# Build a second workflow instance with the same connection settings as the
# first one, but with a 180 timeout instead of None. Insertion order matches
# the keyword order used when calling RAGFlow.
restored_workflow_settings = {
    "opik_host": OPIK_BASE_URL,
    "opik_project_name": OPIK_PROJ_NAME,
    "llm_model_name": LLM_MODEL,
    "collection_name": COLLECTION_NAME,
    "qdrant_host": QDRANT_HOST,
    "qdrant_port": QDRANT_PORT,
    "redis_host": REDIS_HOST,
    "redis_port": REDIS_PORT,
    "timeout": 180,
}
aworkflow = RAGFlow(**restored_workflow_settings)

# Rebuild the context from the dict loaded from Redis, bound to the new
# workflow and decoded with the same serializer type used for saving.
restored_ctx = Context.from_dict(
    aworkflow,
    loaded_ctx_dict,
    serializer=JsonSerializer(),
)

In [None]:
# Run the new workflow with `initialize_ctx=True` on the restored context.
# NOTE(review): presumably re-attaches non-serializable state to the context
# after loading — confirm against RAGFlow.
await aworkflow.run(initialize_ctx=True, ctx=restored_ctx)

In [None]:
# Query the new workflow using the context restored from Redis.
query_str = "What is 'A Christmas Carol' novel's main theme?"
response = await aworkflow.run(query=query_str, ctx=restored_ctx)

In [None]:
# Display the whole result object returned by the query run.
response

In [None]:
# Ask which book is stored in the database, reusing the restored context.
response = await aworkflow.run(
    query="What is the book stored in database?", ctx=restored_ctx
)

In [None]:
# Display the `response` attribute of the latest query result.
response.response

In [None]:
# Same question, but run without passing `ctx`; the result is shown as the
# cell output and not assigned to `response`.
# NOTE(review): every other run in this notebook passes a context — confirm
# that omitting it here is intentional (e.g. to compare against a run that
# does not use the restored context) and not an oversight.
await aworkflow.run(query="What is the book stored in database?")

In [None]:
# Ask the 'A Christmas Carol' question again with the restored context.
response = await aworkflow.run(
    query="What is 'A Christmas Carol' novel's main theme?", ctx=restored_ctx
)

In [None]:
# Display the `response` attribute of the latest query result.
response.response

In [None]:
# Repeat of the same query with the same restored context; the result
# overwrites `response`.
# NOTE(review): identical to an earlier cell — confirm whether the repetition
# is intentional or a leftover.
response = await aworkflow.run(
    query="What is 'A Christmas Carol' novel's main theme?", ctx=restored_ctx
)