In [7]:
# Imports
import fitz  # PyMuPDF
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.schema import Document
import torch

# Step 1: Load PDF using PyMuPDF
def load_pdf(path):
    print(f"📄 Loading PDF from: {path}")
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    return [Document(page_content=text)]

# Step 2: Split PDF into smaller chunks
def split_text(docs):
    print("✂️ Splitting PDF text into chunks...")
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(docs)

# Step 3: Embed the chunks using SentenceTransformer
def embed_chunks(chunks):
    print("🔍 Creating embeddings and vector store...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(chunks, embeddings)
    return vector_store

# Step 4: Load lightweight LLM (Flan-T5 Base)
def load_llm():
    print("🧠 Loading Flan-T5 Base model...")
    model_id = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    return HuggingFacePipeline(pipeline=pipe)

# Step 5: Build the Retrieval QA chain
def create_pdf_qa_bot(pdf_path):
    docs = load_pdf(pdf_path)
    chunks = split_text(docs)
    vectorstore = embed_chunks(chunks)
    llm = load_llm()

    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True
    )
    print("✅ PDF QA Bot ready!")
    return qa_chain

# Step 6: Ask questions interactively (corrected for Jupyter)
def chat_with_pdf():
    pdf_path = input("📂 Enter the path to your PDF file: ").strip()

    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return

    try:
        bot = create_pdf_qa_bot(pdf_path)
    except Exception as e:
        print(f"❌ Error setting up the bot: {e}")
        return

    print("\n💬 Ask questions about your PDF (type 'exit' to quit):")
    while True:
        question = input("You: ")
        if question.lower() == "exit":
            print("👋 Goodbye!")
            break
        try:
            result = bot.invoke({"query": question})
            print(f"\n🤖 Answer: {result['result']}\n")
            for i, doc in enumerate(result["source_documents"]):
                print(f"📄 Source {i+1}: {doc.page_content[:150]}...\n")
        except Exception as e:
            print(f"⚠️ Error during answer generation: {e}")

# 🔥 Run it
chat_with_pdf()


📂 Enter the path to your PDF file:  C:\Users\nares\Downloads\Resume.pdf


📄 Loading PDF from: C:\Users\nares\Downloads\Resume.pdf
✂️ Splitting PDF text into chunks...
🔍 Creating embeddings and vector store...
🧠 Loading Flan-T5 Base model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  10%|9         | 94.4M/990M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


✅ PDF QA Bot ready!

💬 Ask questions about your PDF (type 'exit' to quit):


You:  What are the skills of Naresh ?



🤖 Answer: coding, problem-solving, and technology

📄 Source 1: EDUCATION
NARESH
V
UNDERGRADUATE
CONTACT
6384520705
nareshvelu366@gmail.com
5 School St. Ammoor
Ranipet Tamilnadu
VIYANIKETAN MATRIC HR
HSC Exam - 97....



You:  exit


👋 Goodbye!
