In [None]:
import os
import pandas as pd
from dotenv import load_dotenv

import qdrant_client
import gutenbergpy.textget

# from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, Settings, Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI
# from llama_index.core.indices.query.query_transform import HyDEQueryTransform

# Pull variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Fail fast with an actionable message when the key is absent.
# The former `os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")` was a
# no-op when the key existed and raised an opaque "str expected, not NoneType"
# TypeError when it did not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

In [None]:
# --- Models ------------------------------------------------------------------
LLM_MODEL = "gpt-5-mini"  # model name handed to the OpenAI LLM wrapper
EMBED_MODEL = "BAAI/bge-base-en-v1.5"  # model name handed to FastEmbedEmbedding

# --- Service endpoints -------------------------------------------------------
QDRANT_HOST, QDRANT_PORT = "localhost", 6333
REDIS_HOST, REDIS_PORT = "localhost", 6379

In [None]:
# Dense embedding model, registered on the global Settings object so that
# components which are not given an explicit embed_model pick this one up.
embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)
Settings.embed_model = embed_model

# Chat model. It is registered globally as well: previously `llm` was created
# but never referenced again, so LLM-backed components constructed without an
# explicit `llm=` argument ignored LLM_MODEL and used the library default.
llm = OpenAI(model=LLM_MODEL)
Settings.llm = llm

In [None]:
COLLECTION = "charles_dickens"

# Connect through the endpoint declared in the configuration cell instead of
# repeating the literal host/port here (QDRANT_HOST / QDRANT_PORT were
# previously defined but never used).
client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

# Hybrid store: dense vectors plus a sparse representation produced by the
# "Qdrant/bm25" FastEmbed sparse model.
vector_store = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION,
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm25",
)



# Indexing Pipeline

## Loading

In [None]:
# Book catalogue to ingest; the loading loop reads its "Gutenberg ID" and
# "Title" columns.
path = "../data/test.csv"
df = pd.read_csv(filepath_or_buffer=path)
df

In [None]:
from llama_index.readers.wikipedia import WikipediaReader

reader = WikipediaReader()

# Rebuilt from scratch on every execution of this cell, so re-running it does
# not append duplicates to a previous list.
docs = []

for _, row in df.iterrows():
    # Plain int so the ID always renders as e.g. "46", even if pandas parsed
    # the column as a float or numpy integer.
    book_id = int(row["Gutenberg ID"])
    book_title = row["Title"]

    # Download the raw bytes, drop the Project Gutenberg licence header/footer
    # so the boilerplate is not chunked and indexed, then normalise newlines.
    raw_book = gutenbergpy.textget.get_text_by_id(book_id)
    book_text = (
        gutenbergpy.textget.strip_headers(raw_book)
        .decode("utf-8")
        .replace("\r\n", "\n")
    )

    # One Wikipedia page per book, looked up by the catalogue title.
    wiki_doc = reader.load_data(pages=[book_title])

    # Deterministic ids: the ingestion pipeline's UPSERTS strategy matches
    # documents by id. With the default random UUIDs every re-run would look
    # like brand-new documents and duplicate all vectors in the store.
    docs.extend(
        [
            Document(
                id_=f"gutenberg-{book_id}-book",
                text=book_text,
                metadata={"title": book_title, "source": "book"},
            ),
            Document(
                id_=f"gutenberg-{book_id}-wikipedia",
                text=wiki_doc[0].text,
                metadata={"title": book_title, "source": "wikipedia"},
            ),
        ]
    )

In [None]:
# Show a compact (title, source, character count) summary instead of echoing
# `docs` itself, which would write the full text of every book into the
# notebook output.
[(doc.metadata["title"], doc.metadata["source"], len(doc.text)) for doc in docs]

## Indexing

In [None]:
from llama_index.core.extractors import (
    TitleExtractor,
    # QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import TokenTextSplitter

# Metadata extractor configured with nodes=5.
title_extractor = TitleExtractor(nodes=5)

# Token-based chunker: 512-token chunks, 128-token overlap, split on spaces.
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

In [None]:
from llama_index.core.ingestion import IngestionPipeline, IngestionCache, DocstoreStrategy
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore

# The Redis endpoint comes from REDIS_HOST / REDIS_PORT in the configuration
# cell; the duplicate definitions that used to sit here were removed.

# Document store used by the UPSERTS strategy to recognise documents it has
# already ingested. RedisDocumentStore's constructor takes a KV store, not
# host/port keywords, so the from_host_and_port factory is used.
redis_docstore = RedisDocumentStore.from_host_and_port(REDIS_HOST, REDIS_PORT)

# IngestionPipeline's `cache` argument expects an IngestionCache, so the Redis
# KV store is wrapped in one; "redis_cache" is the collection its entries are
# stored under.
redis_cache = IngestionCache(
    cache=RedisCache.from_host_and_port(REDIS_HOST, REDIS_PORT),
    collection="redis_cache",
)

# Split -> extract titles -> embed, then write the nodes to the vector store.
pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, embed_model],
    vector_store=vector_store,
    docstore=redis_docstore,
    cache=redis_cache,
    docstore_strategy=DocstoreStrategy.UPSERTS,
)

## Storing

In [None]:
# Execute the ingestion pipeline over the loaded documents, with a progress
# bar; the nodes it returns are kept in `nodes`.
nodes = pipeline.run(documents=docs, in_place=True, show_progress=True)

In [None]:
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Generation Pipeline

## Querying

In [None]:
# Build the index on top of the existing vector store instead of re-ingesting
# the documents.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# Query engine in hybrid mode; the two top-k values are passed through
# unchanged.
query_engine = index.as_query_engine(
    vector_store_query_mode="hybrid",
    similarity_top_k=2,
    sparse_top_k=12,
)

In [None]:
# Smoke-test the generation pipeline with a single question.
question = "What is 'A Christmas Carol' novel's main theme?"
response = query_engine.query(question)
print(response)

## Evaluation