<a href="https://colab.research.google.com/github/imAdityaSatya/DocuChat/blob/main/DocuChat_(version_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
# !pip install pypdf transformers torch
!pip install pypdf transformers torch sentence-transformers faiss-cpu
!pip install gradio
# !pip install nltk

In [None]:
import random
import gradio as gr
from pypdf import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import json

# --- Configuration ---
# Optional JSON file that load_responses() reads when it exists on disk.
RESPONSES_FILE = "responses.json"  # Store responses externally
# Number of whitespace-separated words per chunk produced by chunk_text().
MAX_CONTEXT_LENGTH = 400
# Words shared by consecutive chunks: chunk_text() advances by
# MAX_CONTEXT_LENGTH - STRIDE words between chunk starts.
STRIDE = 50
# Minimum QA score an answer needs before get_bot_response() returns it;
# below this the default "not sure" reply is used instead.
CONFIDENCE_THRESHOLD = 0.25
MAX_HISTORY_LENGTH = 10  # Store up to 10 recent interactions
# Model name passed to SentenceTransformer (used for chunks and questions).
EMBEDDER_MODEL = "all-mpnet-base-v2"
# Model and tokenizer names passed to the transformers QA pipeline.
QA_MODEL = "deepset/roberta-base-squad2"
QA_TOKENIZER = "deepset/roberta-base-squad2"

# --- Load Responses ---
def _default_responses():
    """Return the built-in trigger phrases and canned replies.

    Keys ending in ``_in`` hold the (lower-cased) user phrases that trigger a
    reply; the matching ``_out`` key holds either a list of candidate replies
    or a single reply string.
    """
    return {
        "greetings_in": ["hi", "hello", "hey", "good morning", "good evening", "greetings"],
        "greetings_out": ["Hello!", "Hi there!", "Hey! How can I help?"],

        "intro_in": ["who are you", "what are you", "what is your name", "what's your name", "whats your name", "are you a bot", "are you a chatbot", "introduce yourself"],
        "intro_out": ["I am DocuChat, a QnA chatbot that answers questions from your PDF.", "I'm DocuChat, a chatbot that can answer questions from your PDF", "DocuChat here! You can chat with me based on any PDF you upload."],

        "ability_in": ["what can you do", "how can you help me", "how can you help", "how can you assist me"],
        "ability_out": ["I can answer questions from your PDF. Ask me anything!", "I can answer your queries based on any PDF document uploaded by you"],

        "creator_in": ["who built you", "who created you", "who made you", "who's your creator", "who is your creator", "who developed you"],
        "creator_out": ["I was created by Aditya Satya.", "Aditya Satya has developed me"],

        "gratitude_in": ["good", "great", "awesome", "cool", "wow", "nice", "got it", "ok", "okay", "alright"],
        "gratitude_out": ["Happy to help! Anything else you wanna ask?", "Cool! Let me know if you've more questions."],

        "gratitude_in_2": ["thanks", "thank you", "thank you so much", "thanks a lot"],
        "gratitude_out_2": ["You're welcome! Let me know if you wanna ask anything else", "Oh, you're welcome! Is there any other question I can help you with?"],

        "greet_in": ["how are you", "how are you doing", "how are you feeling"],
        "greet_out": ["I'm doing great, thanks for asking. How may I help you?", "Oh, I'm fine. How can I assist you today?"],

        "sure_in": ["are you sure", "you sure", "really"],
        "sure_out": "Yes, to the best of my knowledge. \nHowever, I might be wrong sometimes. Please verify if it seems doubtful.",

        # The last entry used to be a second "you are wrong"; the list pairs
        # each "you're ..." form with its "you are ..." counterpart.
        "wrong_in": ["incorrect", "wrong", "you're wrong", "you're incorrect", "you are wrong", "you are incorrect"],
        "wrong_out": ["Oops! Well, I could be wrong sometimes. \nPlease verify if it seems incorrect.",
                      "I'm sorry if there was a mistake. \nKindly verify it once from your own end."],

        "exit_commands": ["exit", "stop", "quit", "bye", "goodbye", "see ya", "terminate", "no"],
        "empty_input_response": "Your input is empty. Please ask a question.",
        "default_response": "Sorry, I'm not really sure about that. Please rephrase your question or provide more context.",
    }


def load_responses():
    """Load the canned-response table, preferring RESPONSES_FILE when present.

    The JSON file (if it exists) is layered on top of the built-in defaults,
    so a file that only customises some keys still yields a complete table
    instead of causing a ``KeyError`` when a reply is looked up later.

    Returns:
        dict: mapping of response keys to trigger lists / reply lists / reply
        strings.

    Raises:
        ValueError: if RESPONSES_FILE exists but its top-level JSON value is
            not an object.
        json.JSONDecodeError: if RESPONSES_FILE is not valid JSON.
    """
    defaults = _default_responses()
    if not os.path.exists(RESPONSES_FILE):
        return defaults

    # Explicit encoding: the replies may contain non-ASCII text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(RESPONSES_FILE, "r", encoding="utf-8") as f:
        loaded = json.load(f)

    if not isinstance(loaded, dict):
        raise ValueError(
            f"{RESPONSES_FILE} must contain a JSON object, got {type(loaded).__name__}"
        )

    # File entries win; anything the file omits falls back to the default.
    return {**defaults, **loaded}

# Load the trigger/reply table once at import time; get_bot_response() reads it.
responses = load_responses()

# --- PDF Handling ---
def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    Pages for which the reader yields no text contribute an empty string, so
    the result always has one line-group per page.

    Args:
        path: filesystem path of the PDF to read.

    Returns:
        The concatenated page texts.

    Raises:
        ValueError: if the file cannot be opened or parsed; the underlying
            exception is attached as ``__cause__``.
    """
    try:
        reader = PdfReader(path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Callers only handle ValueError, so every failure is funnelled into
        # one; chaining keeps the real reader/OS error in the traceback.
        raise ValueError(f"Error extracting text from PDF: {e}") from e

def chunk_text(text: str, max_length=None, stride=None) -> list:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``max_length - stride`` words apart, so each
    chunk shares ``stride`` words with the one before it.

    Args:
        text: the text to split.
        max_length: maximum number of words per chunk; defaults to
            MAX_CONTEXT_LENGTH.
        stride: number of words shared between consecutive chunks; defaults
            to STRIDE.

    Returns:
        A list of chunk strings (empty if ``text`` contains no words).

    Raises:
        ValueError: if ``stride`` is not smaller than ``max_length``, since
            the window could then never advance.
    """
    max_length = MAX_CONTEXT_LENGTH if max_length is None else max_length
    stride = STRIDE if stride is None else stride

    step = max_length - stride
    if step <= 0:
        raise ValueError("stride must be smaller than max_length")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start : start + max_length]))
        # Once a chunk reaches the end of the text, any later start position
        # would only repeat words already inside this chunk's overlap.
        if start + max_length >= len(words):
            break
    return chunks

# --- Initializing Models ---
# Both models are created once at module level and reused for every request.
# Sentence embedder: encodes PDF chunks in build_index() and user questions
# in get_bot_response().
embedder = SentenceTransformer(EMBEDDER_MODEL)
# Question-answering pipeline that get_bot_response() runs over the retrieved chunks.
qa_pipeline = pipeline("question-answering", model=QA_MODEL, tokenizer=QA_TOKENIZER)

# --- Chatbot Logic ---
def _canned_reply(cleaned_input):
    """Return the scripted reply for ``cleaned_input``, or None if none applies."""
    if cleaned_input in responses["exit_commands"]:
        return "Okay then, Goodbye!"

    # (trigger key, reply key) pairs, checked in this priority order.
    scripted = (
        ("intro_in", "intro_out"),
        ("ability_in", "ability_out"),
        ("creator_in", "creator_out"),
        ("gratitude_in", "gratitude_out"),
        ("gratitude_in_2", "gratitude_out_2"),
        ("greet_in", "greet_out"),
        ("greetings_in", "greetings_out"),
        ("sure_in", "sure_out"),
        ("wrong_in", "wrong_out"),
    )
    for trigger_key, reply_key in scripted:
        if cleaned_input in responses[trigger_key]:
            reply = responses[reply_key]
            # A reply entry is either one fixed string (e.g. "sure_out") or a
            # list of alternatives; random.choice on a string would return a
            # single character.
            return reply if isinstance(reply, str) else random.choice(reply)

    if not cleaned_input:
        return responses["empty_input_response"]
    return None


def _answer_from_pdf(user_input, pdf_data, top_k=4):
    """Answer ``user_input`` from the most similar PDF chunks, or fall back to a stock reply."""
    if pdf_data is None:
        return "Please upload a PDF first. Then we can chat based on that."

    contexts, faiss_index = pdf_data
    q_emb = embedder.encode([user_input], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    _, neighbor_ids = faiss_index.search(q_emb, k=top_k)

    best_score = 0.0
    best_answer = responses["default_response"]
    for idx in neighbor_ids[0]:
        # FAISS pads the result with -1 when the index holds fewer than top_k
        # vectors; without this guard contexts[-1] would silently re-query
        # the last chunk.
        if idx < 0:
            continue
        result = qa_pipeline(question=user_input, context=contexts[idx])
        if result["score"] > best_score:
            best_score = result["score"]
            best_answer = result["answer"]

    if best_score >= CONFIDENCE_THRESHOLD:
        return best_answer
    return responses["default_response"]


def get_bot_response(history, pdf_data):
    """Fill in the bot reply for the newest pending user message.

    Args:
        history: chat history as a list of ``(user_message, bot_reply)``
            pairs; the entry to answer is the last one and has ``None`` as
            its reply.
        pdf_data: ``None`` until a PDF has been processed, otherwise a
            ``(contexts, faiss_index)`` pair as returned by upload_pdf().

    Returns:
        ``(history, pdf_data)``: the history with the last reply filled in
        and trimmed to the newest MAX_HISTORY_LENGTH entries, plus the
        unchanged PDF data. If there is nothing pending, both are returned
        untouched.
    """
    # Nothing to do when the history is empty or the last turn is answered.
    if not history or history[-1][1] is not None:
        return history, pdf_data

    user_input = history[-1][0]
    # The second strip() handles input like "how are you ?", where removing
    # the punctuation exposes trailing whitespace.
    cleaned_input = user_input.lower().strip().rstrip("?!").strip()

    bot_reply = _canned_reply(cleaned_input)
    if bot_reply is None:
        # Not small talk: treat it as a question about the uploaded PDF.
        try:
            bot_reply = _answer_from_pdf(user_input, pdf_data)
        except Exception as e:
            # UI boundary: report the failure in the chat rather than letting
            # the event handler crash.
            print(f"Error: {e}")
            bot_reply = "I encountered an error. Please try again later."

    history[-1] = (history[-1][0], bot_reply)

    # Keep only the most recent turns.
    history = history[-MAX_HISTORY_LENGTH:]

    return history, pdf_data

# --- Gradio Interface ---
def upload_pdf(file):
    """Process an uploaded PDF into searchable chunks.

    Args:
        file: the value Gradio passes for the uploaded file. Either an object
            exposing the path as ``.name`` or a plain path string is accepted
            (which one arrives presumably depends on the Gradio version and
            the ``gr.File`` configuration).

    Returns:
        ``(status_message, pdf_data)`` where ``pdf_data`` is
        ``(contexts, faiss_index)`` on success and ``None`` on any failure.
    """
    if file is None:
        return "No file uploaded yet.", None

    path = getattr(file, "name", file)
    try:
        text = extract_pdf_text(path)
        contexts = chunk_text(text)
        if not contexts:
            # e.g. a scanned/image-only PDF: there is nothing to embed, and
            # indexing an empty list would fail with an opaque error.
            return "No extractable text found in this PDF.", None
        faiss_index = build_index(contexts)
    except ValueError as e:
        return str(e), None
    return "PDF uploaded and processed.", (contexts, faiss_index)

def build_index(ctxs: list):
    """Embed the text chunks and return a FAISS index over them.

    The embeddings are L2-normalised and stored in an inner-product index,
    so search scores are cosine similarities.

    Args:
        ctxs: non-empty list of text chunks to index.

    Returns:
        A ``faiss.IndexFlatIP`` containing one vector per chunk, in the same
        order as ``ctxs``.

    Raises:
        ValueError: if ``ctxs`` is empty (there would be no embedding
            dimension to size the index with).
    """
    if not ctxs:
        raise ValueError("Cannot build a search index: no text chunks were provided.")

    embeddings = embedder.encode(ctxs, convert_to_numpy=True, show_progress_bar=False)
    faiss.normalize_L2(embeddings)  # in-place normalisation
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index

with gr.Blocks(title="DocuChat") as demo:
    gr.Markdown("# 📄 DocuChat 🤖\nUpload a PDF and chat with it! 📚")

    # --- Layout ---
    with gr.Row():
        pdf_uploader = gr.File(label="Upload PDF", file_types=[".pdf"])

    status = gr.Textbox(label="Status", interactive=False, value="No file uploaded yet.")
    chatbot = gr.Chatbot(label="📄 DocuChat 🤖")

    with gr.Row(equal_height=True):
        msg = gr.Textbox(label="Your Message", placeholder="Ask me anything...", container=False)
        # scale=0 keeps the button at its minimum width next to the textbox.
        send = gr.Button("Send", scale=0)

    # Per-session holder for the processed PDF: (contexts, faiss_index) or None.
    pdf_data = gr.State(None)

    # --- Events ---
    # The PDF is processed as soon as it is uploaded; no separate button.
    pdf_uploader.upload(upload_pdf, inputs=pdf_uploader, outputs=[status, pdf_data])

    def user_message_update(user_input, history):
        """Append the user's message as a pending turn and clear the textbox.

        Returns ``("", history)``: the empty string resets the input box and
        the history now ends with ``(user_input, None)``, which
        get_bot_response() fills in afterwards.
        """
        turns = history or []
        turns.append((user_input, None))
        return "", turns

    # Pressing Enter and clicking "Send" run the same two-step chain:
    # 1) show the user's message immediately (unqueued), then
    # 2) compute the bot reply (queued).
    for trigger in (msg.submit, send.click):
        trigger(
            user_message_update,
            inputs=[msg, chatbot],
            outputs=[msg, chatbot],
            queue=False,
        ).then(
            get_bot_response,
            inputs=[chatbot, pdf_data],
            outputs=[chatbot, pdf_data],
            queue=True,
        )

    demo.launch(debug=True)
