In [None]:
import gradio as gr
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import Ollama
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

class _PDFProcessingError(Exception):
    """Internal failure whose message is the user-facing status string."""


class PDFProcessor:
    """Index uploaded PDFs in a FAISS vector store and answer questions on them.

    The public methods never raise: each returns either its result or a
    human-readable string starting with ``"Error"``, so they can be wired
    straight into a Gradio interface. Internally, failures travel as
    ``_PDFProcessingError`` so that legitimate data which merely *starts*
    with the word "Error" (for example the text of a PDF) is never mistaken
    for a failure.
    """

    def __init__(self):
        self.vector_store = None  # FAISS store built by the last successful run
        self.text_chunks = []  # Chunks produced by the last successful split
        self.pdf_files = None  # To store uploaded PDF files

    def sanitize_text(self, text):
        """Return *text* with characters that cannot be UTF-8 encoded replaced by ``?``."""
        return text.encode("utf-8", "replace").decode("utf-8")

    # ------------------------------------------------------------------
    # Internal helpers: return real values, raise _PDFProcessingError.
    # ------------------------------------------------------------------

    def _extract_text(self, pdf_docs):
        """Return the sanitized text of every page of every PDF, newline-joined."""
        page_texts = []
        try:
            # ``None`` (nothing uploaded) is treated like an empty upload.
            for pdf in pdf_docs or []:
                # Accept both upload objects exposing ``.name`` and plain paths.
                pdf_reader = PdfReader(getattr(pdf, "name", pdf))
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(self.sanitize_text(page_text))
        except Exception as e:
            raise _PDFProcessingError(f"Error reading PDF: {str(e)}") from e

        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next.
        text = "\n".join(page_texts)
        if not text.strip():
            raise _PDFProcessingError("Error: No text found in PDFs.")
        return text

    def _split_text(self, text):
        """Split *text* into overlapping chunks and remember them on the instance."""
        try:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
            chunks = text_splitter.split_text(text)
        except Exception as e:
            raise _PDFProcessingError(f"Error splitting text: {str(e)}") from e

        if not chunks:
            raise _PDFProcessingError("Error: Failed to split text into chunks.")
        self.text_chunks = chunks
        return chunks

    def _build_vector_store(self, text_chunks):
        """Embed *text_chunks* and store the resulting FAISS index on the instance."""
        try:
            embedding_func = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            self.vector_store = FAISS.from_texts(text_chunks, embedding=embedding_func)
        except Exception as e:
            raise _PDFProcessingError(f"Error creating vector store: {str(e)}") from e

    def _build_chain(self):
        """Create the "stuff" question-answering chain backed by the Ollama llama3 model."""
        try:
            prompt_template = """
            Answer the question as detailed as possible based strictly on the content of the uploaded document(s).
            The answer should contain a minimum of 150 words if it is in the context.
            If the answer is not directly found in the document, respond with 'The answer is not in the provided context.'
            Context: {context}
            Question: {question}

            Answer:
            """
            llm = Ollama(model="llama3", temperature=0.5)
            prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
            return load_qa_chain(llm, chain_type="stuff", prompt=prompt)
        except Exception as e:
            raise _PDFProcessingError(f"Error creating conversational chain: {str(e)}") from e

    # ------------------------------------------------------------------
    # Public API: same names, parameters and string-based error contract.
    # ------------------------------------------------------------------

    def get_pdf_text(self, pdf_docs):
        """Return the combined text of *pdf_docs*, or an ``"Error..."`` string.

        *pdf_docs* is an iterable of uploaded-file objects (their ``.name`` is
        used as the path) or plain path strings; ``None`` counts as empty.
        """
        try:
            return self._extract_text(pdf_docs)
        except _PDFProcessingError as e:
            return str(e)

    def get_text_chunks(self, text):
        """Return the list of chunks for *text*, or an ``"Error..."`` string."""
        try:
            return self._split_text(text)
        except _PDFProcessingError as e:
            return str(e)

    def create_vector_store(self, text_chunks):
        """Build the vector store from *text_chunks* and return a status string."""
        try:
            self._build_vector_store(text_chunks)
        except _PDFProcessingError as e:
            return str(e)
        return "Vector store created successfully."

    def get_conversational_chain(self):
        """Return the question-answering chain, or an ``"Error..."`` string."""
        try:
            return self._build_chain()
        except _PDFProcessingError as e:
            return str(e)

    def user_input(self, user_question):
        """Answer *user_question* from the indexed PDFs.

        Returns the model's answer, or an ``"Error..."`` string when no PDFs
        have been processed, the question is blank, nothing similar is found,
        or any step fails.
        """
        try:
            if self.vector_store is None:
                return "Error: No vector store found. Please upload and process PDFs first."

            # Do not spend an embedding + LLM round trip on a blank question.
            if not user_question or not user_question.strip():
                return "Error: Please enter a question."

            docs = self.vector_store.similarity_search(user_question)
            if not docs:
                return "Error: No similar documents found."

            chain = self._build_chain()
            response = chain({"input_documents": docs, "question": user_question})
            if not response or "output_text" not in response:
                return "Error: Failed to generate response."

            return response["output_text"]
        except _PDFProcessingError as e:
            return str(e)
        except Exception as e:
            return f"Error processing user input: {str(e)}"

    def process_pdfs(self, pdf_files):
        """Extract, chunk and index *pdf_files*; return a status string.

        Returns ``"Processing completed."`` on success, otherwise the
        ``"Error..."`` message of the first step that failed.
        """
        self.pdf_files = pdf_files
        try:
            combined_text = self._extract_text(pdf_files)
            text_chunks = self._split_text(combined_text)
            self._build_vector_store(text_chunks)
        except _PDFProcessingError as e:
            return str(e)

        return "Processing completed."

# One shared processor: the vector store built in the first tab is the one
# queried in the second tab.
pdf_processor = PDFProcessor()

# Tab 1: upload one or more PDF files and index them.
process_interface = gr.Interface(
    title="Process PDFs",
    description="Upload multiple PDF files, then click 'Process' to analyze.",
    fn=pdf_processor.process_pdfs,
    inputs=gr.File(file_count="multiple", label="Upload PDF Files"),
    outputs="text",
)

# Tab 2: ask questions about whatever has been indexed.
question_interface = gr.Interface(
    title="Ask Questions",
    description="Ask questions based on the processed PDF files.",
    fn=pdf_processor.user_input,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs="text",
)

# Present the two interfaces as tabs of a single application.
app = gr.TabbedInterface(
    [process_interface, question_interface],
    tab_names=["Process PDFs", "Ask Questions"],
)

app.launch()


  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




--------
  warn_deprecated(
  warn_deprecated(
