In [3]:
# 📄 PDF Question Answering System - Complete in One Cell
# Just copy, paste, and run in Google Colab!

# Install dependencies
print("🔧 Installing dependencies...")
import subprocess
import sys
def install(package):
    """Install *package* quietly with pip, using the interpreter running this cell.

    Going through ``sys.executable -m pip`` makes sure the package lands in
    the same environment the notebook kernel imports from.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q", package]
    subprocess.check_call(pip_command)

# Third-party packages the rest of this cell imports, installed one at a time
# in the order listed.
for _package in (
    "langchain",
    "langchain-community",
    "chromadb",
    "sentence-transformers",
    "pypdf",
    "gradio",
):
    install(_package)

# Import libraries
import os
import re
import tempfile
import uuid
import warnings
from typing import List

import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
warnings.filterwarnings('ignore')

print("Setup complete!")

# PDF RAG System Class
class PDFChatSystem:
    """Answer questions about one uploaded PDF via retrieval and sentence extraction.

    A PDF is loaded with ``PyPDFLoader``, split into overlapping chunks,
    embedded with a sentence-transformers model and indexed in a Chroma
    vector store.  No language model is involved in answering: the chunks
    most similar to the question are retrieved and the sentences that share
    the most keywords with the question are returned verbatim.
    """

    # Question words shorter than this are too generic to score on.
    _MIN_KEYWORD_CHARS = 4
    # Sentence fragments of this length or shorter are dropped as noise.
    _MAX_FRAGMENT_CHARS = 20
    # Upper bound on the number of sentences stitched into one answer.
    _MAX_ANSWER_SENTENCES = 3
    # Length of the raw excerpt shown when no sentence matches any keyword.
    _FALLBACK_EXCERPT_CHARS = 500

    _WORD = re.compile(r"\w+")
    # Split after sentence-ending punctuation that is followed by whitespace,
    # so decimals ("3.14") and dotted tokens are not cut in half.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")

    def __init__(self):
        """Create the embedding model and text splitter; no PDF is indexed yet."""
        print("Initializing PDF Chat System...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        # Set by process_pdf(); None means "nothing uploaded yet".
        self.vectorstore = None
        self.chat_history = []
        print("System ready!")

    def process_pdf(self, pdf_file):
        """Index an uploaded PDF so that questions can be asked about it.

        Args:
            pdf_file: the value of the upload component -- either an object
                exposing the file path as ``.name`` or a plain path string;
                ``None`` when nothing has been uploaded.

        Returns:
            A human-readable status message (success or failure); errors are
            reported in the message rather than raised.
        """
        if pdf_file is None:
            return "Please upload a PDF file first!"

        # Accept both a file wrapper with a .name attribute and a bare path.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        try:
            documents = PyPDFLoader(pdf_path).load()
            if not documents:
                return " Could not extract text from PDF. Please check the file."

            texts = self.text_splitter.split_documents(documents)
            if not texts:
                # Pages were found but contained no text to split, so there
                # is nothing to index.
                return " Could not extract text from PDF. Please check the file."

            # A unique collection per upload keeps chunks of a previously
            # processed PDF from being mixed into this document's index.
            new_store = Chroma.from_documents(
                documents=texts,
                embedding=self.embeddings,
                collection_name=f"pdf-{uuid.uuid4().hex}",
            )
        except Exception as e:
            return f" Error processing PDF: {str(e)}"

        # Swap only after the new index was built, so a failed upload leaves
        # the previous document usable.
        self._discard_vectorstore()
        self.vectorstore = new_store

        # Reset chat history for new document
        self.chat_history = []

        return f"PDF processed successfully! \n Extracted {len(texts)} text chunks \n Ready for questions!"

    def _discard_vectorstore(self):
        """Drop the collection of the previously indexed PDF, if there is one."""
        previous = self.vectorstore
        self.vectorstore = None
        if previous is None:
            return
        try:
            previous.delete_collection()
        except Exception:
            # Best-effort cleanup: the replacement index lives in its own
            # collection, so a failed delete cannot affect later answers.
            pass

    @classmethod
    def _tokenize(cls, text):
        """Return the set of lower-cased word tokens in *text*, punctuation removed."""
        return set(cls._WORD.findall(text.lower()))

    @classmethod
    def _select_sentences(cls, question, passages):
        """Pick the sentences from *passages* sharing the most keywords with *question*.

        Sentences are de-duplicated (overlapping chunks repeat text) and
        ranked by keyword overlap; ties keep retrieval order.  Returns at most
        ``_MAX_ANSWER_SENTENCES`` sentences, or an empty list when nothing
        matches.
        """
        keywords = {
            word for word in cls._tokenize(question)
            if len(word) >= cls._MIN_KEYWORD_CHARS
        }
        if not keywords:
            return []

        seen = set()
        scored = []
        for passage in passages:
            for raw in cls._SENTENCE_BOUNDARY.split(passage):
                # Collapse the line breaks PDF extraction leaves mid-sentence.
                sentence = " ".join(raw.split())
                if len(sentence) <= cls._MAX_FRAGMENT_CHARS or sentence in seen:
                    continue
                seen.add(sentence)
                overlap = len(keywords & cls._tokenize(sentence))
                if overlap:
                    scored.append((overlap, sentence))

        # Sort on the score only; the sort is stable, so equally scored
        # sentences stay in retrieval order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [sentence for _, sentence in scored[:cls._MAX_ANSWER_SENTENCES]]

    def answer_question(self, question, history):
        """Answer *question* from the indexed PDF and append the exchange to *history*.

        Args:
            question: the user's question text; blank or ``None`` is ignored.
            history: list of ``[question, answer]`` pairs shown in the chat;
                it is extended in place (a new list is used when ``None``).

        Returns:
            A ``(history, "")`` tuple: the updated chat history and an empty
            string that clears the question textbox.
        """
        if history is None:
            history = []

        if not question or not question.strip():
            return history, ""

        if self.vectorstore is None:
            history.append([question, "Please upload a PDF file first!"])
            return history, ""

        try:
            # Search for relevant chunks
            docs = self.vectorstore.similarity_search(question, k=3)

            if not docs:
                history.append([question, " No relevant information found in the PDF."])
                return history, ""

            best = self._select_sentences(
                question, [doc.page_content for doc in docs]
            )
            if best:
                answer = " ".join(best)
            else:
                # No keyword match: show the start of the closest chunk,
                # marking it with an ellipsis only if it was actually cut.
                excerpt = docs[0].page_content
                limit = self._FALLBACK_EXCERPT_CHARS
                answer = excerpt[:limit] + "..." if len(excerpt) > limit else excerpt

            # Add sources
            answer += f"\n\n **Sources:** Found in {len(docs)} sections of your PDF"

            history.append([question, answer])
            return history, ""

        except Exception as e:
            # UI boundary: surface the failure in the chat instead of raising.
            history.append([question, f" Error: {str(e)}"])
            return history, ""

# Initialize system: one module-level instance whose bound methods are used as
# the Gradio callbacks below.
# NOTE(review): the indexed PDF lives on this single instance, so everyone using
# the same running app presumably shares one document -- confirm this is intended.
pdf_system = PDFChatSystem()

# Create Gradio Interface
def clear_chat():
    """Return a fresh empty message list, used as the chatbot's new value."""
    empty_history = []
    return empty_history

# Build interface
with gr.Blocks(title="PDF Q&A System", theme=gr.themes.Soft()) as app:
    # Page header with usage instructions.
    gr.Markdown(
        """
        #  PDF Question & Answer System

        **Upload any PDF and ask questions about its content!**

        ### How to use:
        1.  Upload your PDF file
        2.  Wait for processing confirmation
        3.  Ask questions about the content
        4.  Get answers with source references
        """
    )

    with gr.Row():
        # Left column: upload, processing trigger and status read-out.
        with gr.Column(scale=1):
            # PDF Upload Section
            gr.Markdown("### 📤 Upload PDF")
            pdf_input = gr.File(
                label="Choose PDF File",
                file_types=[".pdf"],
                file_count="single"
            )

            process_btn = gr.Button("Process PDF", variant="primary", size="lg")

            status_output = gr.Textbox(
                label=" Status",
                lines=4,
                interactive=False
            )

            # The loader reads the PDF's text only and no OCR step is run, so
            # the tips must not promise support for image-only scans.
            gr.Markdown(
                """
                ### 💡 Tips:
                • Upload clear, text-based PDFs
                • Scanned (image-only) PDFs contain no extractable text and will not work
                • Processing may take a few moments for large files
                """
            )

        # Right column (twice as wide): the chat itself.
        with gr.Column(scale=2):
            # Chat Section
            gr.Markdown("###  Ask Questions")

            chatbot = gr.Chatbot(
                label="Q&A Chat",
                height=400,
                show_label=True
            )

            with gr.Row():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask anything about your PDF content...",
                    lines=2,
                    scale=4
                )
                ask_btn = gr.Button(" Ask", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button(" Clear Chat", variant="secondary")

            # Example questions
            gr.Markdown("###  Example Questions:")
            gr.Markdown(
                """
                • *"What is the main topic of this document?"*
                • *"Summarize the key points"*
                • *"What are the conclusions?"*
                • *"Explain [specific topic] mentioned in the PDF"*
                • *"What does the document say about [keyword]?"*
                """
            )

    # Event handlers (must be inside the Blocks context)
    process_btn.click(
        fn=pdf_system.process_pdf,
        inputs=[pdf_input],
        outputs=[status_output]
    )

    # The Ask button and submitting the textbox run the same handler with the
    # same wiring: it receives the question and the chat history, and returns
    # the updated history plus an empty string that clears the textbox.
    for _register in (ask_btn.click, question_input.submit):
        _register(
            fn=pdf_system.answer_question,
            inputs=[question_input, chatbot],
            outputs=[chatbot, question_input]
        )

    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot]
    )

# Launch the app
# Announce start-up before handing control to Gradio.
for _line in (
    " Launching PDF Q&A System...",
    " The interface will open in a new window",
    " A public link will be generated for sharing",
):
    print(_line)

# share=True requests a public link; show_error=True enables Gradio's error display.
_launch_options = dict(share=True, debug=False, show_error=True, height=800)
app.launch(**_launch_options)

# Printed once launch() has returned.
for _line in (
    " PDF Q&A System is now running!",
    " Upload a PDF file and start asking questions!",
):
    print(_line)

🔧 Installing dependencies...
Setup complete!
Initializing PDF Chat System...
System ready!
 Launching PDF Q&A System...
 The interface will open in a new window
 A public link will be generated for sharing
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ab9cac7533a996f094.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


 PDF Q&A System is now running!
 Upload a PDF file and start asking questions!
