<a href="https://colab.research.google.com/github/imAdityaSatya/DocuChat/blob/main/DocuChat_(version_0).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**DocuChat** : QnA ChatBot version-0

In [None]:
# Install dependencies (IPython shell escape, intended for Colab):
# pypdf for PDF text extraction, transformers + torch for the QA pipeline,
# sentence-transformers for embeddings, and faiss-cpu for the vector index.
!pip install pypdf transformers torch sentence-transformers faiss-cpu

In [None]:
import random
from pypdf import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from google.colab import files

# Greeting detection setup
GREETINGS_IN = {"hi", "hello", "hey", "good morning", "good evening", "greetings"}
GREETINGS_OUT = ["Hello!", "Hi there!", "Hey! How can I help?"]

# Characters that commonly surround a greeting ("Hi!", "hello,", " hey ").
_GREETING_STRIP_CHARS = " \t\n.,!?;:"


def check_greeting(text: str) -> str:
    """Return a random greeting reply if *text* is a greeting, else ``""``.

    Matching is case-insensitive and ignores surrounding whitespace and
    punctuation as well as repeated inner whitespace, so ``"Hi!"``,
    ``" hello, "`` and ``"good   morning"`` are all recognised.

    Args:
        text: Raw or pre-cleaned user input.

    Returns:
        One entry of ``GREETINGS_OUT`` when the normalised text is in
        ``GREETINGS_IN``; otherwise the empty string.
    """
    # Collapse whitespace first so multi-word greetings compare equal, then
    # trim punctuation (and any space left next to it) from both ends.
    normalized = " ".join(text.lower().split()).strip(_GREETING_STRIP_CHARS)
    return random.choice(GREETINGS_OUT) if normalized in GREETINGS_IN else ""


In [None]:
# Functions for PDF Text Extraction & Chunking

def extract_pdf_text(path: str) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines.

    A page whose extraction yields nothing contributes an empty string, so
    the result always contains one segment per page.
    """
    page_texts = []
    for page in PdfReader(path).pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def chunk_text(text: str, max_len: int = 400, stride: int = 50) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``max_len - stride`` words apart, so each chunk
    shares its first ``stride`` words with the end of the previous one.

    Args:
        text: The text to split; words are delimited by any whitespace.
        max_len: Maximum number of words per chunk. Must be positive.
        stride: Number of words shared between neighbouring chunks. Must
            satisfy ``0 <= stride < max_len``.

    Returns:
        The chunks in document order, each a single-space-joined string.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_len`` is not positive or ``stride`` is outside
            ``[0, max_len)``. Such values would make the window stand still,
            move backwards, or skip words.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be positive, got {max_len}")
    if not 0 <= stride < max_len:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_len, got stride={stride}, "
            f"max_len={max_len}"
        )

    words = text.split()
    step = max_len - stride
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start : start + max_len]))
        # Once a window reaches the end of the text, any later window would
        # contain only words already covered by this one's overlap.
        if start + max_len >= len(words):
            break
    return chunks

# Upload PDF file
uploaded = files.upload()
if not uploaded:
    # A cancelled upload would otherwise surface as a bare StopIteration below.
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")
# Only the first uploaded file is used; any additional files are ignored.
pdf_path = next(iter(uploaded))

# Extract and chunk text
full_text = extract_pdf_text(pdf_path)
contexts = chunk_text(full_text)
if not contexts:
    # Fail here with a clear message instead of later, when an empty chunk
    # list reaches the embedding/indexing step.
    raise RuntimeError(
        f"No extractable text found in '{pdf_path}'. "
        "The PDF may be empty or contain only scanned images."
    )
print(f"Extracted {len(contexts)} chunks from your PDF")

In [None]:
# Load the sentence-embedding model used for both the chunks and the queries
embedder = SentenceTransformer("all-mpnet-base-v2")

# Embed every text chunk; the result is a 2-D NumPy array (one row per chunk)
embeddings = embedder.encode(contexts, convert_to_numpy=True, show_progress_bar=True)

# Normalize and Build FAISS Index for Retrieval.
# normalize_L2 rescales the rows in place to unit length, so the inner-product
# index below ranks chunks by cosine similarity.
faiss.normalize_L2(embeddings)
dimension = embeddings.shape[1]  # embedding width = number of columns
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)  # row i of the index corresponds to contexts[i]
print("FAISS index built successfully")


# Initialize the QnA Pipeline (model and tokenizer come from the same checkpoint)
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2"
)
print("QnA Pipeline Initialized")

In [None]:
# Interactive Chatbot
print("📄DocuChat🤖 is ready! \nType in your question and hit Enter\tType 'exit' or 'quit' to stop\n")

# Canned-response triggers, matched against the normalized user input
# (lowercased, "?" and "!" removed, surrounding whitespace stripped).
EXIT_COMMANDS = {"exit", "quit", "bye", "goodbye"}
INTRO_QUERIES = {"who are you", "what's your name", "whats your name", "what can you do", "are you a bot", "are you a chatbot", "introduce yourself"}
CREATOR_QUERIES = {"who built you", "who created you", "who made you", "who's your creator", "who is your creator"}
GRATITUDE_QUERIES = {"thanks", "thank you", "good", "great", "wow", "awesome", "cool", "nice", "got it", "ok", "okay"}
GREET_QUERIES = {"how are you", "how are you doing", "how are you feeling"}

# Retrieval / answering parameters
TOP_K = 3              # number of chunks retrieved per question
MIN_CONFIDENCE = 0.25  # QA scores below this are reported as "not sure"

while True:
    user_input = input("You: ").strip()
    # Normalize for trigger matching. Strip again after removing punctuation so
    # inputs such as "how are you ?" do not keep a trailing space.
    cleaned_input = user_input.lower().replace("?", "").replace("!", "").strip()

    # Handle empty input first (also covers input made only of "?" / "!")
    if not cleaned_input:
        print("Bot: Your input is empty. Please ask a question.")
        continue

    # Exit Commands
    if cleaned_input in EXIT_COMMANDS:
        print("Bot: Goodbye!")
        break

    # Intro query handling
    if cleaned_input in INTRO_QUERIES:
        print("Bot: I am DocuChat, a QnA chatbot that can answer questions based on any PDF you upload.")
        continue

    # Creator query handling
    if cleaned_input in CREATOR_QUERIES:
        print("Bot: I was created by Aditya Satya :) ")
        continue

    # Gratitude query handling
    if cleaned_input in GRATITUDE_QUERIES:
        print("Bot: Happy to help! \n     Let me know if there's anything else I can help you with. \n")
        continue

    # Greet query handling
    if cleaned_input in GREET_QUERIES:
        print("Bot: I'm doing great, thanks for asking. \n     How may I help you?\n")
        continue

    # Handle greetings
    greet = check_greeting(cleaned_input)
    if greet:
        print(f"Bot: {greet}")
        continue

    # Encode the query as a (1, dim) batch so it can be normalized in place
    # and passed to FAISS directly, without relying on reshape() views.
    q_emb = embedder.encode([user_input], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with -1, which would otherwise index contexts[-1].
    k = min(TOP_K, index.ntotal)
    neighbor_ids = index.search(q_emb, k)[1][0] if k > 0 else []

    # Run QA on each retrieved chunk and keep the highest-scoring answer
    best_answer, best_score = None, 0.0
    for idx in neighbor_ids:
        if idx < 0:  # defensive: skip FAISS "no result" placeholders
            continue
        result = qa_pipeline(question=user_input, context=contexts[idx])
        if result["score"] > best_score:
            best_answer, best_score = result["answer"], result["score"]

    # Confidence threshold
    if best_score < MIN_CONFIDENCE:
        print("Bot: Sorry, I'm not really sure about that.")
    else:
        print(f"Bot: {best_answer}")
