In [1]:
from transformers import pipeline
import gradio as gr
import PyPDF2  # For PDF files
from docx import Document  # For Word files



In [2]:
# Load both Hugging Face pipelines once at module level; process_document()
# reads them as globals on every request.
question_answerer = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

Device set to use mps:0
Device set to use mps:0


In [3]:
# Open file (PDF, Word, or text) and extract text
def extract_text_from_file(file_path):
    """Extract the text content of a PDF, Word (.docx), or plain-text file.

    The format is selected from the file extension, compared
    case-insensitively so that names such as ``REPORT.PDF`` are accepted.

    Args:
        file_path: Path to the document, as a string.

    Returns:
        The extracted text as one string. PDF pages are concatenated in
        order; Word paragraphs are joined with newlines; text files are
        returned as read (decoded as UTF-8).

    Raises:
        ValueError: If the extension is not .pdf, .docx, or .txt.
    """
    # Only the extension check is case-folded; the file is still opened with
    # the path exactly as given.
    lowered_path = file_path.lower()

    if lowered_path.endswith('.pdf'):
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # Guard against a page yielding None (e.g. a page with no text
            # layer), which would otherwise break the concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    elif lowered_path.endswith('.docx'):
        doc = Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    elif lowered_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError("Unsupported file format. Please upload a PDF, Word (.docx), or plain text (.txt) file.")

In [4]:
# Function to handle the question-answering process
def process_document(file, question=None, summarize=False):
    """Answer a question about an uploaded document and/or summarize it.

    Args:
        file: The uploaded file. Either an object exposing the on-disk path
            as ``.name`` or a plain path string; ``None`` means nothing was
            uploaded.
        question: Optional question to answer from the document text.
            Empty or whitespace-only values are treated as "no question".
        summarize: When true, append a generated summary to the output.

    Returns:
        A human-readable string with the answer block and/or the summary,
        a hint when no action was requested, or an ``Error...`` message.
        Exceptions are reported in the returned string rather than raised,
        because the result is shown directly in the UI output box.
    """
    if file is None:
        return "Error: No file uploaded."

    try:
        # Accept both an upload object (path in .name) and a bare path string.
        file_path = getattr(file, "name", file)
        text = extract_text_from_file(file_path)

        # Without any text (e.g. a scanned PDF with no text layer) neither
        # model can do useful work, so fail early with a clear message.
        if not text or not text.strip():
            return "Error: No readable text could be extracted from the document."

        question = question.strip() if question else ""
        output = ""

        # Answer the question if provided
        if question:
            result = question_answerer(question=question, context=text)
            output += (
                f"Question: {question}\n"
                f"Answer: {result['answer']}\n"
                f"Confidence Score: {result['score']:.4f}\n"
                f"Answer Location: (Start: {result['start']}, End: {result['end']})\n\n"
            )

        # Generate a summary if requested
        if summarize:
            # truncation=True clips inputs longer than the model's maximum
            # input length instead of letting the pipeline raise on them.
            summary = summarizer(
                text,
                max_length=300,
                min_length=100,
                do_sample=False,
                truncation=True,
            )
            output += f"Summary:\n{summary[0]['summary_text']}\n"

        return output if output else "No action selected. Please ask a question or request a summary."

    except Exception as e:
        return f"Error processing the file: {str(e)}"

In [None]:
# Create the Gradio app
# Builds a single-page Blocks UI: a file upload, an optional question box, a
# "Generate Summary" checkbox, and a Submit button wired to process_document().
# Components are laid out in the order they are created below.
with gr.Blocks() as app:
    gr.Markdown("# Document Question Answering & Summarization")
    gr.Markdown("Upload a document (PDF, Word, or Text) and ask questions or request a summary.")
    gr.Markdown("### Note: This is a demo version. For production use, consider using a more robust solution.")
    
    # Input 1: the uploaded document (passed to process_document as `file`).
    with gr.Row():
        file_input = gr.File(label="Upload Document")
    
    # Input 2: free-text question (passed as `question`).
    with gr.Row():
        question_input = gr.Textbox(label="Ask a Question (Optional)")
    
    # Input 3: whether to also produce a summary (passed as `summarize`).
    with gr.Row():
        summarize_checkbox = gr.Checkbox(label="Generate Summary")
    
    with gr.Row():
        submit_button = gr.Button("Submit")
    
    # Receives the string returned by process_document().
    output_box = gr.Textbox(label="Output", lines=10, show_copy_button=True)
    
    # The order of `inputs` must match process_document's parameter order:
    # (file, question, summarize).
    submit_button.click(
        fn=process_document,
        inputs=[file_input, question_input, summarize_checkbox],
        outputs=output_box
    )

# Launch the app
# Both launch calls are commented out, so running this cell only builds `app`
# and does not start a server; uncomment exactly one of them to serve the UI.
# NOTE(review): share=True presumably requests a public share link — confirm
# that exposing uploaded documents publicly is intended before enabling it.
#app.launch(show_error=True)
#app.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
IMPORTANT: You are using gradio version 3.50.2, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://73a45bd120319a9fa7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


