In [1]:
!pip install pypdf[crypto]

Collecting pypdf[crypto]
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [2]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.23-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.33.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.4-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3

In [3]:
!pip install sentence_transformers



In [4]:
!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-5.9.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.2 (from gradio)
  Downloading gradio_client-1.5.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.met

## **Importing required packages**

In [5]:
# Libraries for loading and parsing the pdf book
from pypdf import PdfReader
import os

# For creating a database for storing embeddings
import chromadb
from chromadb.utils import embedding_functions

# For structuring the instructions to be passed to LLM
from transformers import pipeline

# For Retrieving & Storing the HF token
from huggingface_hub.hf_api import HfFolder
from google.colab import userdata

# For User Interface
import gradio as gr

# For response articulation via LLM
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

## **PDF parser**

In [6]:
def load_documents(file_path, first_page=102, last_page=146):

    """
    Extract the text of a range of pages from a pdf file.

    Parameters:
        file_path (string): file path as input of Biology pdf book
        first_page (int): zero-based index of the first page to extract
        last_page (int): zero-based index of the last page to extract (inclusive)

    Returns:
        dict: maps each zero-based page index in the range to the text
              extracted from that page, terminated by a newline
    """

    reader = PdfReader(file_path)

    # The default range was found by manually inspecting the book
    # (chapters 4 and 5). Clamp it to the pages that actually exist so a
    # shorter pdf yields fewer entries instead of an IndexError.
    start = max(first_page, 0)
    stop = min(last_page, len(reader.pages) - 1)

    dict_page = {}
    for page_index in range(start, stop + 1):
        page_text = reader.pages[page_index].extract_text()
        # A real newline (not the literal characters "/n") ends each page.
        dict_page[page_index] = page_text + "\n"
    return dict_page

# Location of the uploaded book inside the Colab runtime.
file_path = r'/content/ConceptsofBiology-WEB.pdf'

## **Creating chunks from the text corpus**

In [7]:
def chunking(content: dict, chunk_size: int = 500) -> dict:
    """
    Split each page's text into sentence-aligned chunks.

    Parameters:
        content (dict): maps a page key to the text of that page
        chunk_size (int): maximum length of a chunk in characters,
            including the spaces that join its sentences

    Returns:
        dict: maps each page key to the list of chunk strings built from
              that page. A chunk only exceeds chunk_size when a single
              sentence is longer than chunk_size on its own.
    """
    sentence_endings = ('.', '!', '?')
    chunks = {}

    for key, value in content.items():
        page_chunks = []
        current_chunk = []
        current_size = 0

        # Newlines are layout artefacts of the pdf; sentences are split on ". "
        for sentence in value.replace('\n', ' ').split('. '):
            sentence = sentence.strip()
            if not sentence:
                continue

            # The split removes the full stop; restore it unless the sentence
            # already ends with terminal punctuation.
            if not sentence.endswith(sentence_endings):
                sentence += '.'

            # Length the chunk would have once this sentence is joined on,
            # counting the separating space.
            separator = 1 if current_chunk else 0
            projected_size = current_size + separator + len(sentence)

            if projected_size > chunk_size and current_chunk:
                # Close the running chunk and start a new one with this sentence
                page_chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_size = len(sentence)
            else:
                current_chunk.append(sentence)
                current_size = projected_size

        # Flush whatever is left for this page
        if current_chunk:
            page_chunks.append(' '.join(current_chunk))

        chunks[key] = page_chunks

    return chunks

In [10]:
# Sanity check of the parse + chunk steps.
# NOTE(review): `chunk` is keyed by page, so len(chunk) is the number of pages
# processed, not the number of chunks stored later.
chunk = chunking(load_documents(file_path))
print(f'We have {len(chunk)} documents')

We have 45 documents


In [11]:
def process_document(file_path: str):

    """
    Parse a pdf, chunk it and lay the result out as parallel lists for ChromaDB.

    Parameters:
        file_path (string): file path as input of Biology pdf book

    Returns:
        list: one id per chunk, "<file name>_page_<page>_chunk_<position>"
        list: the chunk texts, in the same order as the ids
        list: one metadata dict per chunk with keys "source", "page", "chunk"
    """

    source_name = os.path.basename(file_path)

    # page key -> list of chunk strings
    chunks_by_page = chunking(load_documents(file_path))

    ids, chunks_collection, metadatas = [], [], []
    for page_key, page_chunks in chunks_by_page.items():
        for position, text in enumerate(page_chunks):
            chunks_collection.append(text)
            ids.append(f"{source_name}_page_{page_key}_chunk_{position}")
            metadatas.append({"source": source_name, "page": page_key, "chunk": position})

    return ids, chunks_collection, metadatas

## **Setting up vector database (ChromaDB) and storing text embeddings**

In [12]:
# Initialize ChromaDB client with persistence; data is kept under the relative
# directory "chroma_db".
client = chromadb.PersistentClient(path="chroma_db")

# Configure sentence transformer embeddings. The model files are downloaded
# from the Hugging Face Hub on first use (see the progress bars in the output).
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create or get existing collection. `collection` is a notebook-level global
# that the search and RAG cells further down rely on.
collection = client.get_or_create_collection(
    name="documents_collection",
    embedding_function=sentence_transformer_ef
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
def add_to_collection(collection, ids, texts, metadatas, batch_size=100):

    """
    Add documents to ChromaDB in batches.

    Parameters:
        collection: target collection; only its add(documents=, metadatas=, ids=)
            method is used
        ids (list): unique id of every document
        texts (list): text of every document
        metadatas (list): metadata dict of every document
        batch_size (int): maximum number of documents sent per add() call

    Returns:
        None

    Raises:
        ValueError: if the three lists differ in length or batch_size < 1
    """

    if not texts:
        return

    if not (len(ids) == len(texts) == len(metadatas)):
        raise ValueError(
            "ids, texts and metadatas must have the same length "
            f"(got {len(ids)}, {len(texts)}, {len(metadatas)})"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(texts), batch_size):
        # Slicing past the end is safe, so the last batch is simply shorter
        stop = start + batch_size
        collection.add(
            documents=texts[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )

In [14]:
# Calling functions to parse the pdf and then upload the data to ChromaDB.
# NOTE(review): the client is persistent, so re-running this cell submits the
# same ids again - confirm how the installed ChromaDB version treats duplicates.

ids, texts, metadatas = process_document(file_path)
add_to_collection(collection, ids, texts, metadatas)

## **Carrying out Semantic search on the vector database**

In [15]:
def semantic_search(query: str, collection, n_results: int = 2):

    """
    Look up the chunks in a ChromaDB collection that best match a question.

    Parameters:
        query (string): question asked by the user
        collection: collection to search; its query() method is called
        n_results (integer): number of top matches to request

    Returns:
        whatever collection.query() returns for the single query text
    """

    # query() takes a batch of texts, hence the one-element list
    return collection.query(query_texts=[query], n_results=n_results)

def get_context_with_sources(results):

    """
    Turn search results into a context string and a list of source labels.

    Parameters:
        results (dict): search response; results['documents'][0] holds the
            matched texts and results['metadatas'][0] their metadata dicts
            (each with 'source', 'page' and 'chunk' keys)

    Returns:
        string: the matched texts joined by blank lines
        list: one "<source> (page <page>) (chunk <chunk>)" label per match
    """

    # Index 0 selects the matches of the first (only) query in the response
    matched_texts = results['documents'][0]
    matched_metadatas = results['metadatas'][0]

    sources = []
    for meta in matched_metadatas:
        sources.append(f"{meta['source']} (page {meta['page']}) (chunk {meta['chunk']})")

    context = "\n\n".join(matched_texts)
    return context, sources

## **Loading LLM from HF**

In [16]:
# Saving HF token via secret's feature of Colab.
# The token is read from the Colab secret named 'RAG_llama3.2_proj' and stored
# with HfFolder.save_token so it never appears in the notebook itself.

my_hf_key = userdata.get('RAG_llama3.2_proj')
HfFolder.save_token(my_hf_key)

In [17]:
# Storing LLM and tokenizer
# NOTE(review): generate_response passes `model_name` (not `model`) to
# pipeline(), so the `model` object loaded here is not what the pipeline is
# given; `tokenizer` is passed to it directly.

model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

## **Prompt creation for LLM to articulate the final response**

In [18]:
def get_prompt(context: str, query: str):

    """
    Build the prompt text handed to the LLM.

    The prompt has a system section (instructions, the retrieved context and
    the answer-style rule), a user section holding the question, and an open
    assistant header for the model to continue from.

    Parameters:
        context (string): retrieved text the answer must be based on
        query (string): question asked by the user

    Returns:
        string: the assembled prompt
    """

    system_instruction = "You are given a context and a question. As a helpful assistant you need to answer question from the context below."
    answer_rule = "Provide a conversational answer with brevity.If answer is not in the context, return 'I do not know'."

    return f'''
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>{system_instruction}

    {context}

    {answer_rule}<|eot_id|>

    <|start_header_id|>user<|end_header_id|>{query}<|eot_id|>

    <|start_header_id|>assistant<|end_header_id|>'''

In [19]:
def generate_response(query: str, context: str):

    """
    Create a prompt from the user query and retrieved context and run it
    through the LLM.

    Parameters:
        query (string): question asked by the user
        context (string): response collected by appending the top k matches

    Returns:
        dict: the pipeline's result for the conversation; callers read the
              answer from its 'generated_text' key
    """

    prompt = get_prompt(context, query)

    # Constructing the pipeline from a model name is expensive, so it is built
    # once per model name and kept on the function object for later queries
    # instead of being rebuilt on every call.
    pipelines = generate_response.__dict__.setdefault("_pipelines", {})
    if model_name not in pipelines:
        pipelines[model_name] = pipeline(
            "text-generation",
            model=model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            tokenizer=tokenizer,
            return_full_text=False
        )
    pipe = pipelines[model_name]

    # A batch that contains exactly one conversation. This spells out what the
    # earlier trailing comma (a one-element tuple) did implicitly, so the
    # outputs[0][-1] indexing below keeps its meaning.
    # NOTE(review): get_prompt already embeds chat header tokens and the
    # pipeline may apply the chat template on top of them - confirm intended.
    conversations = [
        [{"role": "user", "content": prompt}],
    ]

    outputs = pipe(
        conversations,
        max_new_tokens=4096,
        pad_token_id=tokenizer.eos_token_id
    )

    # outputs[0] -> results of the only conversation, [-1] -> its last entry
    return outputs[0][-1]

In [40]:
def rag_query(query: str, collection = collection, top_k: int = 4):

    """
    RAG query function to retrieve relevant chunks and generate answer.

    Parameters:
        query (string): question asked by the user
        collection: ChromaDB collection to search (defaults to the notebook's
            global collection as it was when this cell was run)
        top_k (integer): top k semantic matches to be considered for answer

    Returns:
        string: the generated answer followed by a "Sources:" section listing
                one source label per line ("Nil" when the model does not know)
    """

    # Get relevant chunks
    results = semantic_search(query, collection, top_k)
    context, sources = get_context_with_sources(results)

    # Generate response
    answer = generate_response(query, context)['generated_text']

    # The model does not always reproduce the fallback sentence verbatim
    # (surrounding whitespace, missing full stop, different casing), so compare
    # a normalised form. The retrieved sources are meaningless when the model
    # could not answer, hence they are replaced by "Nil".
    if answer.strip().rstrip('.').strip().lower() == 'i do not know':
        sources = ["Nil"]

    return f"{answer}\n\nSources: \n" + "\n".join(sources)

## **LLM based RAG Output**

In [41]:
# Question whose answer is expected to be in the indexed chapters
query = "What is beta oxidation?"
final_response = rag_query(query, collection)

# Print results
print("\nQuery:", query)
print("\nAnswer:", final_response)


Query: What is beta oxidation?

Answer: Beta oxidation is a type of metabolic pathway that breaks down fatty acids into acetyl-CoA units. It's an aerobic pathway, meaning it requires oxygen to proceed, and it's a crucial step in the citric acid cycle, also known as the Krebs cycle or TCA cycle.

In beta oxidation, a fatty acid is broken down into acetyl-CoA units by the enzyme acyl-CoA dehydrogenase. This process involves the removal of two carbon atoms from the fatty acid, resulting in two acetyl-CoA molecules. Each turn of the cycle releases two carbon dioxide molecules.

Think of it like a conveyor belt: the fatty acid is fed into the beta oxidation pathway, and the acetyl-CoA units are produced, one at a time, while carbon dioxide is released. This process is repeated multiple times, resulting in the formation of a large number of acetyl-CoA units, which can be fed into the citric acid cycle for further breakdown.

Beta oxidation is an essential process for energy production in ce

In [42]:
# Question unrelated to the book: exercises the 'I do not know' fallback,
# for which rag_query reports the sources as "Nil"
query = "Who is the president of India"
final_response = rag_query(query, collection)

# Print results
print("\nQuery:", query)
print("\nAnswer:", final_response)


Query: Who is the president of India

Answer: I do not know.

Sources: 
Nil


In [39]:
# Question containing a misspelling ("cels").
# NOTE(review): this cell's execution count (39) is lower than the cells above
# and its saved output formats "Sources:" differently, so the output was
# presumably produced by an earlier version of rag_query - re-run to refresh.
query = "What is the energy currency used by cels?"
final_response = rag_query(query, collection)

# Print results
print("\nQuery:", query)
print("\nAnswer:", final_response)


Query: What is the energy currency used by cels?

Answer: Response: The energy currency used by cells is ATP, or adenosine triphosphate. It's a molecule that contains the potential for a quick burst of energy that cells can use to perform various functions, like powering their work and reactions.

Sources: ConceptsofBiology-WEB.pdf (page 104) (chunk 1)
ConceptsofBiology-WEB.pdf (page 107) (chunk 7)
ConceptsofBiology-WEB.pdf (page 113) (chunk 1)
ConceptsofBiology-WEB.pdf (page 105) (chunk 4)


## **Gradio - WIP**

In [43]:
# Launching user interface using Gradio

from functools import partial

with gr.Blocks() as interface:
    # Page title (HTML tags are closed so the markup is well-formed)
    gr.Markdown("<center><h1>Biology Assistant</h1></center>")

    # Named question_box so the notebook-level `query` string used by the
    # demo cells is not rebound to a UI component.
    question_box = gr.Textbox(label = "Please type your question here -", lines = 3)

    submit_btn = gr.Button("Submit", variant = "primary", size = 'sm')

    llama_output = gr.Markdown("Response will appear here", label="Model Response")

    # Event binding: the text box content is passed to rag_query as `query`
    # and the returned string is rendered in the Markdown component.
    submit_btn.click(
        partial(rag_query, collection=collection),  # Function to process the input
        inputs=question_box,  # Input component
        outputs=llama_output  # Output component
    )

# debug=True keeps the cell running so errors and logs stay visible
interface.launch(debug = True)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d6efb79b3457b78af5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d6efb79b3457b78af5.gradio.live


