### Import Modules

In [11]:
import PyPDF2
import chromadb

from google import genai
from google.genai import types

import re
import os
import random
import json
import uuid

### Initialize Client

In [12]:
# Runtime configuration: the Gemini API key is read from the environment
# (None when the variable is unset); `pdf` is the relative path of the
# document that gets indexed.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
pdf = "data/datasheet.pdf"

In [13]:
# Module-level ChromaDB client used by generate_and_save_embeddings().
# NOTE(review): no settings are passed, so this is presumably chromadb's default
# non-persistent client whose data is lost on kernel restart — confirm.
chroma_client = chromadb.Client()

### Index the Document (PDF)

In [14]:
def extract_sentences_from_pdf(pdf_path):
    """Read a PDF and split its text into rough sentences.

    The text of every page is extracted once, each page's text is followed by
    a single space, and the concatenation is split on the literal separator
    ". ". Each piece is stripped of surrounding whitespace and empty pieces
    are dropped.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        list[str]: Non-empty, stripped text fragments in document order. The
        ". " separator is consumed by the split, so a fragment only keeps a
        trailing period when it was not followed by a space.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Call extract_text() only once per page instead of once for the
            # None check and a second time for the value.
            page_text = page.extract_text()
            if page_text is not None:
                page_texts.append(page_text)

    # Same layout as before: every extracted page is followed by one space.
    text = "".join(page_text + " " for page_text in page_texts)
    stripped = (fragment.strip() for fragment in text.split('. '))
    return [fragment for fragment in stripped if fragment]

In [15]:
def generate_text_embeddings(sentences:list, task_type:str="SEMANTIC_SIMILARITY", batch_size:int=100):
    """Embed a list of texts with the "text-embedding-004" model.

    The texts are sent in consecutive batches rather than in one request, so
    documents with many sentences do not exceed the per-request limit of the
    embedding endpoint.
    NOTE(review): the default of 100 texts per request reflects the Gemini API
    batch limit as understood at the time of writing — confirm against the
    current API documentation.

    Args:
        sentences: Texts to embed, in the order the vectors should come back.
        task_type: Task type forwarded to ``types.EmbedContentConfig``.
            Defaults to "SEMANTIC_SIMILARITY", the value used previously.
        batch_size: Maximum number of texts sent per request.

    Returns:
        list: One embedding vector (the ``values`` of each returned embedding)
        per input text, in input order. Empty when ``sentences`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # Nothing to embed: avoid a pointless (and failing) remote call.
    if not sentences:
        return []

    client = genai.Client(api_key=GOOGLE_API_KEY)
    config = types.EmbedContentConfig(task_type=task_type)

    vectors = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        result = client.models.embed_content(
            model="text-embedding-004", contents=batch, config=config)
        vectors.extend(embedding.values for embedding in result.embeddings)
    return vectors

In [16]:
def generate_and_save_embeddings(pdf_path):
    """Index a PDF: extract its sentences, embed them and store them in ChromaDB.

    The sentences are written to the "my_collection" collection of the
    module-level ``chroma_client`` (created on first use). Record ids are
    derived deterministically from the source path, the sentence position and
    the sentence text, and records are upserted, so running this again for the
    same document overwrites its records instead of adding duplicates.
    Records left over from an earlier, longer version of the same document
    are not removed.

    Args:
        pdf_path: Path of the PDF to index; also stored as the ``source``
            metadata of every record.

    Returns:
        The collection the records were written to.

    Raises:
        ValueError: If no sentences could be extracted, or if the number of
            embeddings does not match the number of sentences.
    """
    sentences = extract_sentences_from_pdf(pdf_path)
    if not sentences:
        raise ValueError(f"No text could be extracted from {pdf_path!r}")

    embeddings = generate_text_embeddings(sentences)
    if len(embeddings) != len(sentences):
        raise ValueError(
            f"Expected {len(sentences)} embeddings but received {len(embeddings)}"
        )

    # Create the collection, or reuse it when it already exists
    collection = chroma_client.create_collection(name="my_collection", get_or_create=True)

    # Stable ids: the position keeps repeated sentences distinct, while the
    # path and text make the id reproducible across runs.
    ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"{pdf_path}#{position}:{sentence}"))
        for position, sentence in enumerate(sentences)
    ]

    # Upsert rather than add so a re-run replaces records with the same id
    collection.upsert(
        embeddings=embeddings,
        ids=ids,
        documents=sentences,
        metadatas=[{"source": pdf_path} for _ in sentences]
    )

    return collection

In [17]:
# Index the configured PDF and keep the returned collection for the
# retrieval cells further down.
collection = generate_and_save_embeddings(pdf)

### Perform Retrieval

In [18]:
def rag_query(query, collection, k=5):
    """Answer a question using sentences retrieved from a collection.

    The question is embedded, the ``k`` nearest stored documents are fetched
    from ``collection``, and they are placed in front of the question in a
    prompt sent to the "gemini-2.0-flash-001" model.

    Args:
        query: The question to answer.
        collection: Collection exposing ``query(query_embeddings=..., n_results=...)``.
        k: Number of documents to request from the collection.

    Returns:
        The ``text`` attribute of the model response when it has one,
        otherwise the response object itself.
    """
    # Embed the question and look up its nearest neighbours.
    question_vector = generate_text_embeddings([query])[0]
    hits = collection.query(query_embeddings=[question_vector], n_results=k)
    retrieved = hits.get('documents', [[]])[0]

    # Fall back to a fixed placeholder when nothing was retrieved.
    context_block = "\n".join(retrieved) if retrieved else "No relevant context found."
    prompt = f"Context:\n{context_block}\n\nQuestion: {query}\nAnswer:"

    # Ask Gemini for the answer, capping the length of the reply.
    generation_config = types.GenerateContentConfig(max_output_tokens=1000)
    llm = genai.Client(api_key=GOOGLE_API_KEY)
    response = llm.models.generate_content(
        model="gemini-2.0-flash-001",
        contents=prompt,
        config=generation_config,
    )

    if not hasattr(response, 'text'):
        return response
    return response.text

In [19]:
# Smoke test: run one question through rag_query() against the indexed
# collection and print the question together with the generated answer.
test_query = "Whats the battery life of the EchoSound Max on single charge?"
answer = rag_query(test_query, collection)
print(f"\nRAG Answer to: '{test_query}'\n{answer}")


RAG Answer to: 'Whats the battery life of the EchoSound Max on single charge?'
This document does not mention the battery life of the EchoSound Max.



In [None]:
# Second smoke test with a different question; it reuses (and overwrites)
# the `test_query` and `answer` names from the previous query cell.
test_query = "What's the size of the SSD in Nimbus Book?"
answer = rag_query(test_query, collection)
print(f"\nRAG Answer to: '{test_query}'\n{answer}")