In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

import qdrant_client
import gutenbergpy.textget

from charles_dicken_qa_chatbot.constants import *

# from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, Settings, Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.llms.openai import OpenAI
# from llama_index.core.indices.query.query_transform import HyDEQueryTransform

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL)
Settings.embed_model = embed_model

llm = OpenAI(model=LLM_MODEL)

Fetching 5 files: 100%|██████████| 5/5 [00:02<00:00,  1.89it/s]


In [3]:
client = qdrant_client.QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

vector_store = QdrantVectorStore(
    client=client,
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm25",
    collection_name=COLLECTION_NAME,
)


Fetching 18 files: 100%|██████████| 18/18 [00:01<00:00, 10.59it/s]


# Indexing Pipeline

## Loading

In [4]:
path = "../data/test.csv"
df = pd.read_csv(path)
df

Unnamed: 0,Gutenberg ID,Title
0,46,A Christmas Carol


In [5]:
from llama_index.readers.wikipedia import WikipediaReader

reader = WikipediaReader()

docs = []

for _, row in df.iterrows():
    book_id = row["Gutenberg ID"]
    book_title = row["Title"]
    book_text = (
        gutenbergpy.textget.get_text_by_id(book_id)
        .decode("utf-8")
        .replace("\r\n", "\n")
    )
    wiki_doc = reader.load_data(pages=[book_title])
    docs.extend(
        [
            Document(text=book_text, metadata={"title": book_title, "source": "book"}),
            Document(
                text=wiki_doc[0].text,
                metadata={"title": book_title, "source": "wikipedia"},
            ),
        ]
    )

In [6]:
docs

 Document(id_='861bec46-301e-400f-b8f7-7d74285376c4', embedding=None, metadata={'title': 'A Christmas Carol', 'source': 'wikipedia'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='A Christmas Carol. In Prose. Being a Ghost Story of Christmas, commonly known as A Christmas Carol, is a novella by Charles Dickens, first published in London by Chapman & Hall in 1843 and illustrated by John Leech. It recounts the story of Ebenezer Scrooge, an elderly miser who is visited by the ghost of his former business partner Jacob Marley and the spirits of Christmas Past, Present and Yet to Come. In the process, Scrooge is transformed into a kinder, gentler man.\nDickens wrote A Christmas Carol during a period when the British were exploring and re-evaluating past Christmas traditions, including carols, and newer customs such as cards and Christ

## Indexing

In [7]:
from llama_index.core.extractors import (
    TitleExtractor,
    # QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import TokenTextSplitter

text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)

title_extractor = TitleExtractor(nodes=5)

In [12]:
from llama_index.core.ingestion import (
    IngestionPipeline,
    IngestionCache,
    DocstoreStrategy,
)
from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache
from llama_index.storage.docstore.redis import RedisDocumentStore

redis_docstore = RedisDocumentStore.from_host_and_port(
    host=REDIS_HOST, port=REDIS_PORT, namespace=COLLECTION_NAME
)
redis_cache = IngestionCache(
    cache=RedisCache.from_host_and_port(host=REDIS_HOST, port=REDIS_PORT),
    collection=COLLECTION_NAME,
)

pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor, embed_model],
    vector_store=vector_store,
    docstore=redis_docstore,
    cache=redis_cache,
    docstore_strategy=DocstoreStrategy.UPSERTS,
)

In [15]:
nodes = pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

Parsing nodes: 0it [00:00, ?it/s]
0it [00:00, ?it/s]
Generating embeddings: 0it [00:00, ?it/s]


# Generation Pipeline

## Querying

In [None]:
index = VectorStoreIndex.from_vector_store(vector_store)

In [None]:
query_engine = index.as_query_engine(
    similarity_top_k=2, sparse_top_k=12, vector_store_query_mode="hybrid"
)

In [None]:
response = query_engine.query("What is 'A Christmas Carol' novel's main theme?")
print(response)

## Evaluation