<a href="https://colab.research.google.com/github/ketakiraut34/AdaBoost/blob/main/RAG_with_Gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install libraries
!pip install langchain chromadb pypdf google-generativeai sentence_transformers Ipython

Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting pypdf
  Downloading pypdf-5.5.0-py3-none-any.whl.metadata (7.2 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.2.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.33.1-py3-none-any.whl.metadata (1.6 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.33.1-py3-none-any.whl.metadata (2.

#### Import necessary libraries

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from pypdf import PdfReader
import google.generativeai as genai
from pprint import pprint
from google.colab import userdata
from IPython.display import Markdown, display

In [None]:
# Open the annual-report PDF
reader = PdfReader("/content/alphabet annual report.pdf")

# Take the text of every page, trim surrounding whitespace,
# and keep only the pages that still contain something
pdf_texts = list(
    filter(None, (page.extract_text().strip() for page in reader.pages))
)

# Show the first non-empty page as a quick sanity check
pprint(pdf_texts[0])

In [None]:
# Configure a recursive splitter: it tries the separators in the order listed
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on the size of a chunk
    chunk_overlap=0,    # consecutive chunks share no text
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Stitch all pages together (blank line between pages) and split the result into chunks
character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))

# Inspect one sample chunk (index 10, i.e. the 11th)
pprint(character_split_texts[10])

# Report how many chunks were produced
print(f"\nTotal chunks: {len(character_split_texts)}")

In [None]:
# Second pass: re-split every character chunk so that no piece exceeds 256 tokens
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=0,
)

# Flat list collecting the token-bounded pieces of all character chunks, in order
token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

# Inspect one sample piece (index 10, i.e. the 11th)
pprint(token_split_texts[10])

# Report how many token-bounded pieces were produced
print(f"\nTotal chunks: {len(token_split_texts)}")

In [None]:
# Embedding function backed by sentence-transformers (library defaults)
embedding_function = SentenceTransformerEmbeddingFunction()

# Embed a single sample piece (index 10) to see what the function returns
sample_embeddings = embedding_function([token_split_texts[10]])
print(sample_embeddings)

In [None]:
# Create a ChromaDB client
chroma_client = chromadb.Client()

# Fetch the "alphabet_annual_report" collection, creating it on the first run.
# get_or_create_collection (instead of create_collection) keeps this cell re-runnable:
# create_collection raises when a collection with that name already exists.
chroma_collection = chroma_client.get_or_create_collection(
    "alphabet_annual_report",
    embedding_function=embedding_function,
)

# One string ID per token-split chunk: "0", "1", ...
ids = [str(i) for i in range(len(token_split_texts))]

# Upsert (instead of add) so a re-run refreshes the documents stored under existing IDs.
# NOTE: if an earlier run in this kernel produced MORE chunks, the surplus higher-numbered
# IDs stay in the collection; restart the kernel (or delete the collection) after changing
# the chunking parameters.
chroma_collection.upsert(ids=ids, documents=token_split_texts)

# Number of documents now held by the collection (shown as the cell output)
chroma_collection.count()

In [None]:
# Read the Gemini API key from the Colab secret named 'API_KEY'
GEMINI_API_KEY = userdata.get('API_KEY')

# Register the key with the google.generativeai SDK
genai.configure(api_key=GEMINI_API_KEY)

# Sampling settings passed to the model on every generation call
generation_config = dict(
    temperature=0.9,         # randomness of the output (higher means more random)
    top_p=1,                 # nucleus sampling threshold (1 means no filtering)
    top_k=1,                 # only the single highest-probability token is considered
    max_output_tokens=2048,  # cap on the length of the generated answer
)

# Gemini model handle used by the rest of the notebook
model = genai.GenerativeModel(
    generation_config=generation_config,
    model_name="gemini-2.0-flash",
)

In [None]:
def rag(query, retrieved_documents, generative_model=None):
    """Answer ``query`` with a generative model, grounded in the retrieved passages.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of text passages; they are joined with
            blank lines and handed to the model as the only allowed source.
        generative_model: Optional object exposing ``generate_content``.
            When omitted, the notebook-level ``model`` is used.

    Returns:
        The text of the first candidate in the model's response (the text of
        all of its parts, concatenated).

    Raises:
        ValueError: If the response has no candidates, or the first candidate
            has no content parts.
    """
    # Resolve the model lazily so the global is only touched when no override is given
    llm = model if generative_model is None else generative_model

    # Combine the retrieved documents into a single string, separated by blank lines
    information = "\n\n".join(retrieved_documents)

    # Adjacent string literals are concatenated with nothing in between, so every
    # section break has to be written out explicitly.
    prompt = (
        "You will be shown the user's question, and the relevant information from the annual report. "
        "Answer the user's question using only this information.\n\n"
        f"Question: {query}\n\n"
        f"Information:\n{information}"
    )

    # Generate a response using the selected generative model
    response = llm.generate_content([prompt])

    # Fail with a descriptive error instead of a bare IndexError on an empty response
    candidates = response.candidates
    if not candidates:
        feedback = getattr(response, "prompt_feedback", None)
        raise ValueError(
            f"The model returned no candidates (prompt feedback: {feedback!r})."
        )

    parts = candidates[0].content.parts
    if not parts:
        raise ValueError("The first candidate of the model response has no content parts.")

    # Concatenate the text of every part of the first candidate
    return "".join(part.text for part in parts)

In [None]:
# The question to answer from the annual report
query = "What are some major revenues coming from?"

# Ask the collection for the 3 chunks closest to the question
results = chroma_collection.query(n_results=3, query_texts=[query])

# Results are grouped per query text; only one query was sent, so take group 0
retrieved_documents = results["documents"][0]

# Show each retrieved chunk, followed by blank lines for readability
for document in retrieved_documents:
    pprint(document)
    print("\n")


In [None]:
# Ask the model to answer the question from the retrieved chunks only
output = rag(query, retrieved_documents)

# Render the answer as Markdown in the cell output
display(Markdown(output))