In [None]:
!pip install pypdf[crypto]

In [None]:
!pip install chromadb

In [None]:
!pip install sentence_transformers

In [None]:
!pip install --upgrade gradio

## **Importing required packages**

In [None]:
# Libraries for loading and parsing the pdf book
from pypdf import PdfReader
import os

# For creating a database for storing embeddings
import chromadb
from chromadb.utils import embedding_functions

# For structuring the instructions to be passed to LLM
from transformers import pipeline

# For Retrieving & Storing the HF token
from huggingface_hub.hf_api import HfFolder
from google.colab import userdata

# For User Interface
import gradio as gr

# For response articulation via LLM
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

## **PDF parser**

In [None]:
def load_documents(file_path, start_page: int = 102, end_page: int = 146):

    """
    Extracts the text of a range of pages from a pdf file.

    Parameters:
        file_path (string): file path as input of Biology pdf book
        start_page (int): zero-based index of the first page to extract
        end_page (int): zero-based index of the last page to extract (inclusive)

    Returns:
        dict: maps each extracted zero-based page index to that page's text,
              terminated by a newline
    """

    reader = PdfReader(file_path)

    # The default range (chapter 4 & chapter 5) was found via manual inspection.
    # Clamp it to the document so a shorter pdf simply yields fewer (or no) pages.
    first_index = max(start_page, 0)
    last_index = min(end_page, len(reader.pages) - 1)

    dict_page = {}
    for page_index in range(first_index, last_index + 1):
        # Fall back to an empty string if the extractor yields nothing for a page
        page_text = reader.pages[page_index].extract_text() or ""
        dict_page[page_index] = page_text + "\n"
    return dict_page

# Location of the source pdf inside the Colab runtime
file_path = r'/content/ConceptsofBiology-WEB.pdf'

## **Creating chunks from the text corpus**

In [None]:
def chunking(content: dict, chunk_size: int = 500):
    """
    Split the text of every page into sentence-aligned chunks.

    Parameters:
        content (dict): maps a page key to the text of that page
        chunk_size (int): soft upper bound on a chunk's size, measured in
            characters of its sentences (the joining spaces are not counted)

    Returns:
        dict: maps each page key to a list of chunk strings. A chunk never
              spans two pages, and a single sentence longer than chunk_size
              ends up as a chunk of its own.
    """
    chunks = {}

    for key, value in content.items():
        # Flatten line breaks, then use ". " as a simple sentence boundary
        sentences = value.replace('\n', ' ').split('. ')

        chunks[key] = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            # split('. ') dropped the period; restore it unless the sentence
            # already ends with terminal punctuation (avoids "?." and "!.")
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'

            sentence_size = len(sentence)

            # Close the running chunk when this sentence would overflow it;
            # an empty running chunk always accepts the sentence
            if current_size + sentence_size > chunk_size and current_chunk:
                chunks[key].append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_size = sentence_size
            else:
                current_chunk.append(sentence)
                current_size += sentence_size

        # Flush whatever is left for this page
        if current_chunk:
            chunks[key].append(' '.join(current_chunk))

    return chunks

In [None]:
chunk = chunking(load_documents(file_path))
# `chunk` maps page -> list of chunks, so len(chunk) would count pages;
# sum the per-page lists to report the number of chunk documents instead.
print(f'We have {sum(len(page_chunks) for page_chunks in chunk.values())} documents')

In [None]:
def process_document(file_path: str):

    """
    Parse one pdf and flatten its chunks into the parallel id / text / metadata
    lists used to populate ChromaDB.

    Parameters:
        file_path (string): file path as input of Biology pdf book

    Returns:
        list: one id string per chunk, "<file name>_page_<page>_chunk_<index>"
        list: the chunk texts, in the same order as the ids
        list: one metadata dict per chunk with "source", "page" and "chunk" keys
    """

    # Extract the page texts and cut every page into chunks
    page_chunks = chunking(load_documents(file_path))

    # Only the bare file name is recorded as the source
    source_name = os.path.basename(file_path)

    ids, chunks_collection, metadatas = [], [], []
    for page, pieces in page_chunks.items():
        for position, piece in enumerate(pieces):
            ids.append(f"{source_name}_page_{page}_chunk_{position}")
            chunks_collection.append(piece)
            metadatas.append({"source": source_name, "page": page, "chunk": position})

    return ids, chunks_collection, metadatas

## **Setting up vector database (ChromaDB) and storing text embeddings**

In [None]:
# Persistent ChromaDB client backed by the local "chroma_db" path
client = chromadb.PersistentClient(path="chroma_db")

# Embedding function backed by the all-MiniLM-L6-v2 sentence-transformer model
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Fetch the collection if it is already there, otherwise create it
collection = client.get_or_create_collection(name="documents_collection", embedding_function=sentence_transformer_ef)

In [None]:
def add_to_collection(collection, ids, texts, metadatas, batch_size: int = 100):

    """
    Add documents to ChromaDB in batches.

    Parameters:
        collection: ChromaDB collection (any object exposing
            add(documents=..., metadatas=..., ids=...))
        ids (list): one unique id per document
        texts (list): the document texts
        metadatas (list): one metadata dict per document
        batch_size (int): maximum number of documents sent per add() call

    Returns:
        None

    Raises:
        ValueError: if batch_size is smaller than 1, or if ids, texts and
            metadatas do not all have the same length
    """

    # Nothing to store
    if not texts:
        return

    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Misaligned lists would attach ids/metadata to the wrong text, so fail
    # before anything is written
    if not (len(ids) == len(texts) == len(metadatas)):
        raise ValueError(
            "ids, texts and metadatas must have the same length "
            f"(got {len(ids)}, {len(texts)}, {len(metadatas)})"
        )

    for start in range(0, len(texts), batch_size):
        stop = start + batch_size  # slicing clamps to the end of the lists
        collection.add(
            documents=texts[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )

In [None]:
# Parse and chunk the pdf, then upload every chunk to the ChromaDB collection

ids, texts, metadatas = process_document(file_path)
add_to_collection(collection=collection, ids=ids, texts=texts, metadatas=metadatas)

## **Carrying out Semantic search on the vector database**

In [None]:
def semantic_search(query: str, collection, n_results: int = 2):

    """
    Look up the entries of a ChromaDB collection that best match a question.

    Parameters:
        query (string): question asked by the user
        collection: ChromaDB collection to search
        n_results (integer): number of matches to request

    Returns:
        dict: whatever collection.query returns for this single query text
    """

    # The query is wrapped in a list because query_texts takes a batch of texts
    return collection.query(query_texts=[query], n_results=n_results)

def get_context_with_sources(results):

    """
    Turn the search results of a single query into a context string and a
    list of human-readable source labels.

    Parameters:
        results (dict): search results holding 'documents' and 'metadatas',
            each a list with one entry per query

    Returns:
        string: the matched chunks of the first query, separated by blank lines
        list: one "<source> (page <page>) (chunk <chunk>)" label per match
    """

    # Only the first (and only) query's matches are used
    context = "\n\n".join(results['documents'][0])

    sources = []
    for meta in results['metadatas'][0]:
        label = f"{meta['source']} (page {meta['page']}) (chunk {meta['chunk']})"
        sources.append(label)

    return context, sources

## **Loading LLM from HF**

In [None]:
# Read the Hugging Face token from the Colab secret 'RAG_llama3.2_proj'
# (kept out of the notebook itself) and save it through HfFolder

my_hf_key = userdata.get('RAG_llama3.2_proj')
HfFolder.save_token(token=my_hf_key)

In [None]:
# Checkpoint used for answer generation; the tokenizer and the model weights
# are both loaded under this name

model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

## **Prompt creation for LLM to articulate the final response**

In [None]:
def get_prompt(context: str, query: str):

    """
    Build the Llama-style prompt that carries the instructions, the retrieved
    context and the user's question.

    Parameters:
        context (string): retrieved text the answer must be based on
        query (string): question asked by the user

    Returns:
        string: the filled-in template; the leading newline and the four-space
                indentation of every template line are part of the string
    """

    task_instruction = "You are given a context and a question. As a helpful assistant you need to answer question from the context below."
    answer_instruction = "Provide a conversational answer with brevity.If answer is not in the context, return 'I do not know'."

    return f'''
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>{task_instruction}

    {context}

    {answer_instruction}<|eot_id|>

    <|start_header_id|>user<|end_header_id|>{query}<|eot_id|>

    <|start_header_id|>assistant<|end_header_id|>'''

In [None]:
# Text-generation pipelines that were already built, keyed by model name
_PIPELINE_CACHE = {}


def _get_generation_pipeline():
    """Return the text-generation pipeline for model_name, building it only once."""
    if model_name not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_name] = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            tokenizer=tokenizer,
            return_full_text=False
        )
    return _PIPELINE_CACHE[model_name]


def generate_response(query: str, context: str):

    """
    Create the prompt from the user query and the retrieved context, and pass
    it to the LLM.

    Parameters:
        query (string): question asked by the user
        context (string): response collected by appending the top k matches

    Returns:
        dict: the pipeline's first generation record; callers read the model's
              reply from its 'generated_text' entry
    """

    prompt = get_prompt(context, query)

    # Building the pipeline loads the model, so it is created once and reused
    # instead of being rebuilt for every question
    pipe = _get_generation_pipeline()

    # One chat, passed as a plain list of messages.
    # NOTE(review): get_prompt already embeds Llama header tokens, and the
    # string is sent here as a chat "user" message, so the pipeline presumably
    # applies its chat template on top of them - confirm this nesting is intended.
    messages = [
        {"role": "user", "content": prompt}
    ]

    outputs = pipe(
        messages,
        max_new_tokens=4096,
        pad_token_id=tokenizer.eos_token_id
    )

    # A single chat yields a list with one record per returned sequence
    return outputs[0]

In [None]:
def rag_query(query: str, collection = collection, top_k: int = 4):

    """
    RAG query function to retrieve relevant chunks and generate answer.

    Parameters:
        query (string): question asked by the user
        collection: vector embeddings of the text corpus
        top_k (integer): top k semantic matches to be considered for answer

    Returns:
        string: the generated answer followed by a "Sources:" section that
                lists one source per line ("Nil" when the model answers that
                it does not know)
    """

    # Get relevant chunks
    results = semantic_search(query, collection, top_k)
    context, sources = get_context_with_sources(results)

    # Generate response
    response = generate_response(query, context)
    answer = response['generated_text']

    # The prompt asks for 'I do not know' when the context holds no answer.
    # Compare leniently (case, surrounding whitespace, trailing period) so the
    # unrelated retrieved sources are hidden however the model punctuates it.
    if answer.strip().rstrip('.').strip().lower() == 'i do not know':
        sources = ["Nil"]

    sources_block = "\n".join(sources)
    return f"{answer}\n\nSources: \n{sources_block}"

## **LLM based RAG Output**

In [None]:
query = "What is beta oxidation?"
final_response = rag_query(query=query, collection=collection)

# Show the question, then the generated answer with its sources
print("\nQuery:", query)
print("\nAnswer:", final_response)

In [None]:
# Presumably an out-of-context question to exercise the 'I do not know' path
query = "Who is the president of India"
final_response = rag_query(query=query, collection=collection)

# Show the question, then the generated answer with its sources
print("\nQuery:", query)
print("\nAnswer:", final_response)

In [None]:
# NOTE(review): "cels" looks like a typo for "cells"; left untouched in case it
# deliberately probes how retrieval copes with misspellings
query = "What is the energy currency used by cels?"
final_response = rag_query(query=query, collection=collection)

# Show the question, then the generated answer with its sources
print("\nQuery:", query)
print("\nAnswer:", final_response)

## **Gradio - WIP**

In [None]:
# Launching user interface using Gradio

from functools import partial

with gr.Blocks() as interface:
    # Page title; the tags are closed so the heading is well-formed HTML
    gr.Markdown("<center><h1>Biology Assistant</h1></center>")

    query = gr.Textbox(label = "Please type your question here -", lines = 3)

    submit_btn = gr.Button("Submit", variant = "primary", size = 'sm')

    # Placeholder text that is replaced by the answer returned from rag_query
    llama_output = gr.Markdown("Response will appear here", label="Model Response")

    # Event binding: the textbox value is passed to rag_query as `query`,
    # and the returned string is rendered in the markdown output
    submit_btn.click(
        partial(rag_query, collection=collection),  # Function to process the input
        inputs=query,  # Input component
        outputs=llama_output  # Output component
    )

interface.launch(debug = True)