In [13]:

import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.schema import HumanMessage


In [2]:
# Pull variables from the local .env file into the process environment,
# then report whether the Groq key was found (the key itself is never printed).
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")
print(
    "API Key loaded successfully."
    if api_key
    else "Failed to load API Key. Check your .env file."
)

API Key loaded successfully.


In [None]:
# Step 1: Chunk a PDF document
def load_and_chunk_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    """Load a PDF and split its pages into overlapping chunks.

    Args:
        pdf_path: Path of the PDF file, handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``
            (defaults to 1000, the value previously hard-coded here).
        chunk_overlap: ``chunk_overlap`` passed to the splitter
            (defaults to 200, the value previously hard-coded here).

    Returns:
        Whatever ``split_documents`` returns for the loaded pages.
    """
    pages = PyPDFLoader(pdf_path).load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)

chunks = load_and_chunk_document("doc.pdf")  # Replace with your PDF
# Keep only the text of each chunk; the embedding step works on plain strings
text_chunks = [chunk.page_content for chunk in chunks]


In [None]:
from sentence_transformers import SentenceTransformer

# Open-source sentence-embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every chunk in one call; the result is requested as a NumPy array
embeddings = embedding_model.encode(
    text_chunks,
    convert_to_numpy=True,
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import chromadb

# Initialize ChromaDB (persistent storage under ./chroma_db)
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Create or get the collection
collection = chroma_client.get_or_create_collection(name="my_document_embeddings")

# Store embeddings in ChromaDB.
# - upsert (instead of add) keeps this cell idempotent: re-running it rewrites
#   the existing "chunk_<i>" records rather than colliding with them.
# - Records are written in batches instead of one call per chunk. The batch is
#   kept modest because Chroma limits how many records one call may carry
#   (NOTE(review): the exact limit depends on the installed version).
# - With no chunks the loop body never runs, so nothing is written.
UPSERT_BATCH_SIZE = 500
for start in range(0, len(text_chunks), UPSERT_BATCH_SIZE):
    stop = min(start + UPSERT_BATCH_SIZE, len(text_chunks))
    collection.upsert(
        ids=[f"chunk_{i}" for i in range(start, stop)],  # Unique ID for each chunk
        documents=text_chunks[start:stop],               # Store the actual text
        embeddings=embeddings[start:stop].tolist(),      # numpy rows -> plain lists
    )

print("Embeddings stored successfully!")


Embeddings stored successfully!


In [7]:
# Re-open the on-disk ChromaDB store and fetch the collection by name
# (created here if it does not exist yet).
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(
    name="my_document_embeddings",
)

In [29]:
def retrieve_relevant_chunks(query, top_k=5):
    """Return the stored text chunks most similar to ``query``.

    Args:
        query: Query text; embedded with the notebook-level ``embedding_model``.
        top_k: Maximum number of chunks to return (default 5).

    Returns:
        A list of chunk texts from the notebook-level ``collection``. The list
        is shorter than ``top_k`` when the collection holds fewer records and
        empty when the collection is empty.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Never request more results than there are records. An empty collection
    # short-circuits before any embedding work is done.
    # NOTE(review): how Chroma reacts to an oversized n_results differs between
    # releases; capping here sidesteps that.
    n_results = min(top_k, collection.count())
    if n_results == 0:
        return []

    # Generate embedding for the query
    query_embedding = embedding_model.encode(query, convert_to_numpy=True).tolist()

    # Perform similarity search
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )

    # One query embedding was sent, so the matching documents are at index 0
    return results["documents"][0]


In [30]:
# Groq-hosted chat model used to generate the final answers
llm = ChatGroq(
    model="llama3-8b-8192",
    api_key=api_key,
)

In [33]:
def generate_answer(query, retrieved_chunks):
    """Answer ``query`` with the notebook-level ``llm``, grounded in the chunks.

    Args:
        query: The user's question, inserted verbatim into the prompt.
        retrieved_chunks: Iterable of text chunks; joined with blank lines to
            form the prompt's context section.

    Returns:
        The ``content`` attribute of the object returned by ``llm.invoke``.
    """
    # Format retrieved chunks into context
    context = "\n\n".join(retrieved_chunks)

    # The prompt body is kept at column 0 so no indentation leaks into the text
    # sent to the model. The quoted fallback sentence is closed on its own
    # bullet so the final instruction is not swallowed into the quotation.
    prompt = f"""
You are an AI assistant that provides clear, concise, and informative answers based on provided context.

### Context:
{context}

### Question:
{query}

### Instructions:
- Summarize the relevant information from the context to answer the question.
- Provide a **clear, structured, and fact-based response**.
- If numerical data is present, ensure accuracy and present it in a readable format.
- If the context does not contain enough information, say **"The provided context does not contain sufficient details."**
- Don't include anything except the answer.

### Response:
"""

    # Invoke the LLM
    response = llm.invoke(prompt)

    return response.content

In [34]:
# Example: retrieve supporting chunks for one question, then answer it
query = "Where is harishchandragad and how do we reach there?"
retrieved_chunks = retrieve_relevant_chunks(query)
response = generate_answer(query, retrieved_chunks)

# Show the retrieved context, a divider line, then the model's answer
print(
    retrieved_chunks,
    "...........................................................................................",
    sep="\n",
)
print("Generated Answer:\n", response)


['HARISHCHANDRAGAD \nHarishchandragad trek is one of the most challenging treks in the western ghats of Maharashtra. A \npopular trek which offers a variety of adventures to all kinds of trekkers. \nIt is a hill fort in the Ahmednagar district situated in the Malshej Ghat. It climbs up to an altitude of \n4,670 ft.  \nHarishchandragad is an ancient fort. Its origin is said to have been in the 6th century during the rule \nof the Kalchuri dynasty. But the caves which you see on the top are probably carved out in the 11th \ncentury.  \nThe various constructions on the fort and those in the surrounding region indicate the existence of \ndiverse cultures. Saptatheertha Pushkarni, Kedareshwar cave, Harishchandra temple and other \ncaves are the examples for that.  \nHarishchandragad trek has multiple routes and each route offers a surprise to the trekkers. It is also \na complete package of views once you reach the top. \n1. An overhanging Konkan Kada(cliff) that offers a majestic view of K