In [27]:
#RAG Application
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever, QueryFusionRetriever
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine, MultiStepQueryEngine
from llama_index.core.indices.query.query_transform.base import StepDecomposeQueryTransform
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core import get_response_synthesizer
from llama_index.core import Document
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import load_index_from_storage
from llama_index.core import ServiceContext
import chromadb
from chromadb.config import Settings
from llama_index.llms.huggingface import HuggingFaceLLM

from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader

from typing import List

import os
import warnings
#import asyncio
#import nest_asyncio  # ✅ Required for Jupyter
#nest_asyncio.apply()  # ✅ Allows asyncio.run() inside Jupyter


In [28]:
class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both semantic search and hybrid search."""

    def __init__(
        self,
        vector_retriever,
        keyword_retriever,
        mode: str = "OR",
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._mode = mode
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)

        vector_ids = {n.node.node_id for n in vector_nodes}
        keyword_ids = {n.node.node_id for n in keyword_nodes}

        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in keyword_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(keyword_ids)
        else:
            retrieve_ids = vector_ids.union(keyword_ids)

        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        return retrieve_nodes

In [29]:
# Get the PDF document
warnings.filterwarnings("ignore")

def load_pdf(data):
    num_books = len([f for f in os.listdir(data) if f.endswith(".pdf")])
    loader=DirectoryLoader(data, glob="*.pdf", loader_cls= PyMuPDFLoader)
    documents = loader.load()

    return documents, num_books

#Get data from PDF
temp_documents, num_books = load_pdf("../data/")
print("done loading...")
print(f"Number of books (PDFs) loaded: {num_books}")

# Convert each LangChain Document to a LlamaIndex Document
documents = [Document.from_langchain_format(doc) for doc in temp_documents]


done loading...
Number of books (PDFs) loaded: 2


In [30]:
#Inintialization

# Use a custom LLMs (Ollama)
llm1 = Ollama(model="deepseek-r1:14b", temperature=0.1, streaming=True)
llm2 = Ollama(model="deepseek-r1:14b", temperature=0.1, streaming=True)


Settings.llm = llm2

# Use a custom embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en")

# Function to process and index a batch of documents
text_splitter = TokenTextSplitter(chunk_size = 512, chunk_overlap=128)

In [5]:
#Create database for embedded vectors in order to utilize vector search for semantic search


# create client and a new collection
chroma_client_embed = chromadb.PersistentClient(path="./mychromadb_for_rag_embeddedvectors")
chroma_collection_embed = chroma_client_embed.get_or_create_collection("embedded_vectors")

#Create vector store for the embedded vectors
vector_store_embed = ChromaVectorStore(chroma_collection=chroma_collection_embed)

def process_and_index_batch(documents_batch):
    # Split documents into nodes
    nodes = text_splitter.get_nodes_from_documents(documents_batch)

    # Generate embeddings for each node
    for node in nodes:
        node.embedding = embed_model.get_text_embedding(node.get_content())

    # Add nodes to the vector store
    vector_store_embed.add(nodes)


#Batch process for documents tu put them in database
batch_size = 500  # Adjust based on your system's capacity
for i in range(0, len(documents), batch_size):
    batch = documents[i:i + batch_size]
    process_and_index_batch(batch)
    


In [6]:
#Create database for text chunks - nodes in order to utilize bm25 search for word search

# create client and a new collection
chroma_client_nodes = chromadb.PersistentClient(path="./mychromadb_for_rag_nodes")
chroma_collection_nodes = chroma_client_nodes.get_or_create_collection("nodes")

#Create vector store for the embedded vectors
vector_store_nodes = ChromaVectorStore(chroma_collection=chroma_collection_nodes)


# Function to process and index a batch of documents
def process_and_index_batch(documents_batch):
    # Split documents into nodes
    nodes = text_splitter.get_nodes_from_documents(documents_batch)

    # Prepare data for Chroma
    ids = [node.node_id for node in nodes]
    documents = [node.get_content() for node in nodes]
    metadatas = [{"source": node.ref_doc_id} for node in nodes]

    # Add nodes to the Chroma collection; embeddings will be generated automatically
    chroma_collection_nodes.add(ids=ids, documents=documents, metadatas=metadatas)


#Batch process for documents tu put them in database
batch_size = 500  # Adjust based on your system's capacity
for i in range(0, len(documents), batch_size):
    batch = documents[i:i + batch_size]
    process_and_index_batch(batch)

In [3]:
#Read database to get stored data
chroma_client_embed = chromadb.PersistentClient(path="./mychromadb_for_rag_embeddedvectors")
chroma_collection_embed = chroma_client_embed.get_or_create_collection("embedded_vectors")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection_embed)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, embed_model=embed_model, storage_context=storage_context)



In [4]:
#Read database to get stored data
chroma_client_nodes = chromadb.PersistentClient(path="./mychromadb_for_rag_nodes")
chroma_collection_nodes = chroma_client_nodes.get_or_create_collection("nodes")


# Initialize a simple document store
docstore = SimpleDocumentStore()


# Load documents from ChromaDB into the document store
def load_documents_from_chroma(batch_size=500):
    offset = 0
    while True:
        # Retrieve a batch of documents from the collection
        result = chroma_collection_nodes.get(
            include=['documents', 'metadatas'],
            offset=offset,
            limit=batch_size
        )

        # Extract the retrieved data
        documents = result.get('documents', [])
        metadatas = result.get('metadatas', [])
        ids = result.get('ids', [])  # 'ids' are included by default
        
        # Check if the result contains documents
        if not result['documents']:
            break  # Exit the loop if no more documents are found
        
        # Create Document objects and add them to the SimpleDocumentStore
        doc_objects = [
            Document(text=doc, doc_id=doc_id, extra_info=metadata)
            for doc, metadata, doc_id in zip(documents, metadatas, ids)
        ]
        docstore.add_documents(doc_objects)
        
        # Update the offset for the next batch
        offset += batch_size

load_documents_from_chroma(batch_size=500)

In [25]:
#Prepare retrievers, create hybrid retriever and create query engine

#Create a vector_retriever using StorageContext
vector_retriever = vector_index.as_retriever(similarity_top_k=7)
# Create a BM25 Retriever (Keyword search)
bm25_retriever = BM25Retriever.from_defaults(docstore=docstore , similarity_top_k=10)

# Define Custom Query Prompt
query_prompt = PromptTemplate("""You are an AI language model assistant. Your task is obtaining keywords from the queries 
                              and retrieving relevant information from context. If the queries part has multiple queries, 
                              process them, separatetly.
                              Context:{context_str}                        
                              Queries: {query_str}"""
                             )




# define custom retriever as hybrid
#hybrid_retriever = CustomRetriever(vector_retriever=vector_retriever, keyword_retriever=bm25_retriever)
hybrid_retriever = QueryFusionRetriever(
    [vector_retriever, bm25_retriever], 
    llm = llm1,
    query_gen_prompt=query_prompt,
    similarity_top_k = 7,
    num_queries=1,
    mode="reciprocal_rerank",  # Fusion mode
    retriever_weights=[1.0, 0.0],
    use_async=False,
    verbose=False,
    )

# Define RAG Context-Aware Prompt
rag_prompt = PromptTemplate(
    template="""Answer the questions using only the provided context. If there are multiple questions, answer each of them seperately.
              Provide the document name and relevant page numbers after the answer as the following format:
              Page no: page number of the answer
              Document name: source document name of the answer
              Document file path: source document file path
              Context:
              {context_str}
              Questions: {query_str}

              If there is totally no relevant information in the context, only say 'I don't know.'
              """
              
            
)

# define response synthesizer
response_synthesizer = get_response_synthesizer(
    llm=llm2,
    text_qa_template=rag_prompt,
    streaming=True  # Enable streaming if supported
)

# Create a query engine
query_engine = RetrieverQueryEngine(retriever=hybrid_retriever,
                                    response_synthesizer=response_synthesizer, 
                                    )

query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=response_synthesizer,
)







In [None]:
# Settings.llm=llm2

# # define response synthesizer
# response_synthesizer = get_response_synthesizer(
#     llm=llm2,
#     text_qa_template=rag_prompt,
#     streaming=True  # Enable streaming if supported
# )


# # Create a base query engine using your hybrid retriever and response synthesizer
# base_query_engine = RetrieverQueryEngine(
#     retriever=hybrid_retriever,
#     response_synthesizer=response_synthesizer,
# )

# # Wrap the base query engine with the MultiStepQueryEngine
# multi_step_query_engine = MultiStepQueryEngine(
#     query_engine=base_query_engine,
#     query_transform=StepDecomposeQueryTransform(llm2, verbose= True),
#     num_steps=3,  # Number of decomposition steps
#     early_stopping=True,  # Stop if a satisfactory answer is found
# )

In [23]:
# Ask a question
response = query_engine.query("What is corticosteroid and how can i cook Homemade Crunchwrap Supreme??")
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out what a corticosteroid is and how to make a homemade Crunchwrap Supreme based on the provided context. Let me start by looking through each document and page mentioned.

First, for the question about corticosteroids, I'll check the Medical_book.pdf. On page 78, it mentions Cushing’s syndrome being caused by high levels of hydrocortisone, which is a corticosteroid. It also talks about Addison's disease involving deficiencies in these hormones. Another mention on page 222 discusses corticosteroid drugs used for inflammation and their side effects. So, putting that together, corticosteroids are a type of hormone, often used as medication to reduce inflammation.

Now, for the recipe, I'll look into the Student-Cookbook.pdf. On pages 21 and 40, there's a detailed recipe for Homemade Crunchwrap Supreme. It lists ingredients like large tortilla shells, taco meat, queso sauce, sour cream, lettuce, cheese, and optional toppings. The steps involve warming the

In [26]:
# Ask a question
response = query_engine.query("How can i cook Homemade Crunchwrap Supreme and what is corticosteroid?")
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out how to answer these two questions using only the provided context. The first question is about cooking "Homemade Crunchwrap Supreme," and the second is asking about corticosteroid. Let me go through each step carefully.

Starting with the first question: How can I cook Homemade Crunchwrap Supreme? Looking at the context, there's a recipe for this on page 21 of the Student-Cookbook.pdf. The steps are listed from 1 to 7, detailing how to prepare the tortilla, add fillings like queso sauce, taco meat, lettuce, cheese, sour cream, and optional ingredients. Then it explains cooking them in a pan by flipping and pressing with a spatula. So I can summarize those steps.

Now, the second question is about corticosteroid. I need to check if there's any mention of this term in the context provided. Scanning through all the pages (21, 40, 41, 29, 30, 45, 34), none of them discuss anything related to corticosteroids. The topics are about cooking recipes like Cr

In [8]:
# Ask a question
response = query_engine.query("Ingredients for Basic Egg Salad Sandwich?")
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out the ingredients for the Basic Egg Salad Sandwich based on the provided context. Let me go through each part of the context step by step.

First, looking at the context, I see several sections about different recipes and sandwich ideas. The user is asking specifically about the Basic Egg Salad Sandwich, so I should focus on that.

I notice that in the context, there's a detailed recipe for "Basic Egg Salad Sandwich" listed after the Nut Butter and Banana Open Sandwich. Let me check what ingredients are mentioned there.

The recipe starts with 3 boiled eggs. Then it mentions 1-3 tablespoons of mayonnaise, which seems optional depending on how moist you like the salad. Next is 1 teaspoon of mustard. The optional ingredients include celery (diced into small pieces), salt, pepper, and lettuce. The bread used is whole wheat.

So putting that together, the main ingredients are boiled eggs, mayonnaise, mustard, and optionally celery, salt, pepper, and lett

In [None]:
# Ask a question
response = query_engine.query("What is corticosteroid?")  
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out what a corticosteroid is based on the provided context. Let me go through each document and page mentioned to find any mentions of corticosteroids.

Starting with the first source: ..\data\Medical_book.pdf on page 319. It talks about various drugs, including cortisone, which is described as a steroid produced by the adrenal cortex in response to stress. Cortisone has anti-inflammatory and immunosuppressive properties. There's also mention of methotrexate and NSAIDs, but nothing specifically about corticosteroids beyond cortisone being one.

Looking at another part of the same document on page 66, it discusses Addison’s disease, which involves the adrenal cortex producing cortisol and aldosterone. Cortisol is a potent hormone involved in many bodily functions, including immune response. It also mentions that oral corticosteroids are used for severe acne but doesn't define what corticosteroids are.

In another section on page 78, it refers to Cushing

In [10]:
# Ask a question
response = query_engine.query("What is Tricyclic antidepressants?")   
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out what tricyclic antidepressants are based on the provided context. Let me start by reading through the context carefully.

From the context, I see that tricyclic antidepressants are medicines used to relieve mental depression. They were discovered in the 1950s and help reduce symptoms like sadness, hopelessness, and lack of energy. Some are also used for other conditions like bulimia and panic disorders.

The name "tricyclic" comes from their three-ring chemical structure. These drugs work by correcting chemical imbalances in the brain but can cause side effects because they affect other chemicals throughout the body.

Commonly mentioned drugs include amitriptyline (Elavil), desipramine (Norpramin), and others. They are prescription-only and come in various forms like tablets, capsules, liquids, and injections.

I also note that they interact with other medications, such as MAO inhibitors, and have specific precautions regarding allergies, pregnancy

In [None]:
response = query_engine.query("What is angarabebesi?")   
for token in response.response_gen:
    print(token, end='', flush=True)

<think>
Okay, so I need to figure out what "angarabebesi" is based on the provided context. Let me start by looking through each document and page mentioned.

First, I'll check the documents listed in the context. There are several sources: "..\data\Medical_book.pdf", another with a different file path but same name, and some other files like "Your Audiogram." and others with UUIDs as filenames.

Looking at the content from Medical_book.pdf on page 464, it talks about Bartonellosis, which is caused by Bartonella bacilliformis. It mentions acute and chronic forms, Oroya fever, Carrion's disease, and verruga peruana. There's no mention of "angarabebesi" here.

Next, on page 146 of the same Medical_book.pdf, it discusses altitude sickness (AMS, HAPE, HACE) but again, no reference to "angarabebesi".

Another document is about Auditory Integration Training, mentioning Dr. Alfred Tomatis and others, but nothing related to the term I'm looking for.

There's also a document titled "Tips for Ma

In [17]:
# Define your search query
query = "what is corticosteroid and How can i cook Homemade Crunchwrap Supreme"

# Retrieve relevant nodes
retrieved_nodes = bm25_retriever.retrieve(query)

# Display the retrieved nodes
for node in retrieved_nodes:
    print(f"Node ID: {node.node_id}")
    print(f"Similarity Score: {node.score}")
    print(f"Content: {node.get_content()}\n")
    

Node ID: 836231ce-1be5-48e8-9218-71c31933b08c
Similarity Score: 11.465726852416992
Content: 22 
 
 
 
Homemade Crunchwrap Supreme 
Large (burrito size) tortilla shells 
Small tostada shells (or corn tortilla, or tortilla chips) 
Queso sauce 
Pre-cooked taco meat 
Shredded lettuce 
Shredded Cheese 
Sour cream 
Optional ingredients: taco sauce, jalapenos, bell or banana peppers, black or refried beans, 
olives, tomatoes 
 
1. Warm your tortilla in the microwave to make it soft enough to bend. 
2. Put a dollop of queso sauce in the center of your tortilla, but be sure to leave a wide edge so 
you can wrap it. 
3. Spoon some taco meat on the cheese (beans too, if you want to add them). Place the 
tostada shell, corn tortilla, or tortilla chips on top of the meat. 
4. Spoon a dollop of sour cream and spread over the shell. Sprinkle some shredded lettuce and 
shredded cheese on top (this is where you would add tomatoes, etc.) 
5. Wrap the edges up toward the center until they close around yo