# 1. Imports and Environment Setup


In [45]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Static configuration: the source PDF plus the Pinecone index name and the
# vector dimension later passed to `create_index`.
pdf_path = "text.pdf"
index_name = "pakistan-history"
dimension = 384

# Credentials are read from the environment rather than written in the notebook.
groq_api = os.environ.get("FELLOWSHIP_GROQ_KEY")
pinecone_api = os.environ.get("PINECONE_API_KEY")

# Fail fast, before any client is built, when a prerequisite is missing.
if not pinecone_api:
    raise ValueError("Set PINECONE_API_KEY in your .env")
if not groq_api:
    raise ValueError("Set FELLOWSHIP_GROQ_KEY in your .env")
if not os.path.exists(pdf_path):
    raise ValueError(f"PDF file not found: {pdf_path}")

# 2. Library Imports


In [46]:
import fitz
import pandas as pd
from tqdm.auto import tqdm

from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, AIMessage, HumanMessage

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_pinecone import PineconeVectorStore
from pinecone import AwsRegion, CloudProvider, Pinecone, ServerlessSpec, Metric


# 3. Initialize LLM (Groq)


In [47]:
# Groq-hosted chat model that generates the answers; authenticated with the
# key read from the environment in the setup cell.
chat = ChatGroq(groq_api_key=groq_api, model_name="Llama-3.3-70B-Versatile")

# 4. Load and Split PDF into Chunks

In [48]:
def load_pdf_split(path, chunk_size=500, chunk_overlap=100):
    """Read a PDF and split its plain text into overlapping chunks.

    Args:
        path: Location of the PDF file to open.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between neighbouring chunks, handed to the
            same splitter.

    Returns:
        The list of text chunks returned by the splitter's ``split_text``.
    """
    # Open the file named by the argument (not the notebook-level global) so
    # the function works for any PDF the caller passes in.
    with fitz.open(path) as doc:
        # Every page contributes its text followed by a newline separator.
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # Honour the caller's chunk_size instead of a hard-coded 500.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(text)

# Extract the PDF text, chunk it, and keep the chunks in a one-column frame
# that the upload cell slices into batches.
print("Loading and splitting PDF...")
chunks = load_pdf_split(pdf_path)
data = pd.DataFrame(chunks, columns=["chunks"])
print(f"Loaded and split PDF into {len(data)} chunks.")
            

Loading and splitting PDF...
Loaded and split PDF into 3 chunks.


# 5. Pinecone Setup


In [49]:
# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_api)

# `index_name` and `dimension` come from the configuration cell; they are not
# re-declared here so the two places cannot drift apart.

# Start from a clean slate: drop any existing index with the configured name.
if any(existing["name"] == index_name for existing in pc.list_indexes()):
    pc.delete_index(index_name)

# Create a fresh serverless index sized for the configured vector dimension.
pc.create_index(
    name=index_name,
    metric=Metric.DOTPRODUCT,
    dimension=dimension,
    spec=ServerlessSpec(
        cloud=CloudProvider.AWS,
        region=AwsRegion.US_EAST_1,
    ),
)

# Handle used for the upsert and stats calls in the following cells.
index = pc.Index(index_name)


# 6. Embedding Model


In [50]:
# Embedding model used both for the uploaded chunks and for incoming queries.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


# 7. Upload Data to Pinecone


In [51]:
# Count the vectors already stored in the default ("") namespace; the upload
# cell only runs when this is zero, which avoids inserting duplicates.
stats = index.describe_index_stats()
if stats:
    total_vectors = stats.get("namespaces", {}).get("", {}).get("vector_count", 0)
else:
    total_vectors = 0


In [52]:
if total_vectors == 0:
    print("Index is empty — uploading chunks...")
    batch_size = 100
    for i in tqdm(range(0, len(data), batch_size)):
        # Clamp the end of the slice so the final batch may be shorter.
        i_end = min(len(data), i + batch_size)
        batch = data.iloc[i:i_end]
        # IDs are derived from the row position, so a re-upload of the same
        # chunks overwrites instead of duplicating.
        ids = [f"chunk-{j}" for j in range(i, i_end)]
        texts = batch["chunks"].tolist()
        embeds = embed_model.embed_documents(texts)
        # The raw text travels as metadata under the "text" key.
        metadata = [{"text": t} for t in texts]
        # Pinecone documents `vectors` as a list of (id, vector, metadata)
        # tuples, so materialise the zip rather than passing a one-shot iterator.
        index.upsert(vectors=list(zip(ids, embeds, metadata)))
    print("Upload complete.")
else:
    print(f"Index already contains vectors ({total_vectors}) — skipping upload.")


Index is empty — uploading chunks...


100%|██████████| 1/1 [00:00<00:00,  1.02it/s]

Upload complete.





# 8. VectorStore and Retriever


In [53]:
# Metadata key under which the upload cell stored each chunk's raw text.
text_field = "text"

# LangChain wrapper around the Pinecone index; `text_key` names the metadata
# field that becomes each returned document's page content.
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embed_model,
    text_key=text_field,
)

# Retriever view over the store, configured with k=3.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


# 9. Chat Setup


In [54]:
# Standing instructions for the model: answer from the supplied context and
# admit when the context does not hold the answer.
system_prompt = SystemMessage(
    content=(
        "You are a helpful assistant. Use the provided document context to answer user queries. "
        "If the answer is not in the context, say you don't know."
    )
)

# Running conversation; the system message is always its first entry.
message_history = [system_prompt]

# 10. Helper Functions


In [55]:
def get_context_for_query(query: str, k: int = 3) -> str:
    """Fetch the stored chunks most similar to ``query`` as one text block.

    Args:
        query: The user's question, used as the similarity-search input.
        k: Number of chunks to request from the vector store.

    Returns:
        The ``page_content`` of the returned documents, joined by blank lines
        (an empty string when nothing is returned).
    """
    # Query the vector store directly so the caller's `k` is honoured; the
    # module-level retriever is pinned to its own fixed k.
    docs = vectorstore.similarity_search(query, k=k)
    return "\n\n".join(doc.page_content for doc in docs)

def build_user_prompt_with_context(query: str) -> HumanMessage:
    """Wrap a user question with its retrieved context and answering rules.

    Args:
        query: The user's question.

    Returns:
        A ``HumanMessage`` whose content holds the retrieved context, the
        question, and the instruction to answer only from that context.
    """
    context = get_context_for_query(query)
    # Assemble the prompt piece by piece: context, question, then the rules.
    sections = [
        f"Context:\n{context}\n\n",
        f"Question:\n{query}\n\n",
        "Answer using ONLY the context above. ",
        "If the context does not contain the answer, say 'I don't know from the provided documents.'",
    ]
    return HumanMessage(content="".join(sections))

# 11. Conversational Loop


In [56]:
print("\nConversational RAG Chatbot ready!")
print("Ask anything about the PDF. Type 'exit' or 'quit' to stop.\n")

# Read-answer loop: each turn retrieves context for the question, sends the
# whole history to the model, and records the reply for follow-up questions.
while True:
    try:
        user_input = input("You: ").strip()
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C, or stdin closing (EOF), ends the session cleanly instead of
        # surfacing a traceback.
        print("\nExiting...")
        break

    # Ignore blank lines and re-prompt.
    if not user_input:
        continue
    if user_input.lower() in ("exit", "quit", "bye"):
        print("Goodbye")
        break

    user_msg = build_user_prompt_with_context(user_input)
    message_history.append(user_msg)

    # Send the conversation to the Groq model. `invoke` is the supported
    # entry point; calling the model object directly is deprecated.
    response = chat.invoke(message_history)
    bot_text = response.content if hasattr(response, "content") else str(response)

    print("Bot:", bot_text)
    message_history.append(AIMessage(content=bot_text))



Conversational RAG Chatbot ready!
Ask anything about the PDF. Type 'exit' or 'quit' to stop.

Bot: Liaquat Ali Khan became the first Prime Minister of Pakistan.
Bot: Pakistan was created on August 14, 1947, after the partition of British India. The country experienced periods of democracy and military rule throughout its history. In 1956, Pakistan adopted its first constitution, officially becoming the Islamic Republic of Pakistan. Today, Pakistan is known for its rich culture, strong military, and significant role in South Asian politics. Pakistan faced major challenges from the beginning, including a shortage of resources, migration of millions of refugees, and administrative difficulties. Despite these hardships, Pakistan quickly established its government institutions.
Goodbye
