In [None]:
import os
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from sentence_transformers import CrossEncoder
import torch #Transformers use torch tensors internally for computation.Even inference (via transformers, sentence-transformers, etc.) may use it.
from langchain_community.chat_models import ChatOllama
from transformers import AutoTokenizer
from langchain.schema.messages import HumanMessage, SystemMessage
from datetime import datetime
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# load pdf
def load_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        str: The page texts in document order, separated by a newline so the
        last word of one page does not fuse with the first word of the next.
        Pages that yield no text contribute an empty string.
    """
    reader = PdfReader(file_path)
    # `or ""` guards against pages for which extraction yields nothing
    # (e.g. image-only pages), which would otherwise break the join.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Token counter: module-level tokenizer shared by count_tokens().
# It is loaded from the same checkpoint name that create_vector_store() uses for
# embeddings, not from the "mistral" model that get_llm() talks to.
# NOTE(review): counts are therefore presumably only an approximation of what the
# LLM itself will see — confirm this is acceptable for the budget maths.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def count_tokens(text):
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Dynamic token counter
def get_dynamic_max_tokens(task_type: str, prompt_text: str, model_limit: int = 8192) -> int:
    """Pick a generation budget from what is left of the context window.

    Args:
        task_type: ``"qna"``, ``"summary"``, or anything else for the generic budget.
        prompt_text: The fully formatted prompt; its token count is subtracted
            from ``model_limit``.
        model_limit: Size of the model's context window in tokens.

    Returns:
        int: ``min(cap, share of the remaining window)`` where the remaining
        window is never treated as smaller than 200 tokens:
        qna -> cap 1024, 25 %; summary -> cap 2048, 50 %; other -> cap 1024, all of it.
    """
    remaining = max(200, model_limit - count_tokens(prompt_text))

    # task -> (hard cap, fraction of the remaining window that may be spent)
    budgets = {
        "qna": (1024, 0.25),
        "summary": (2048, 0.5),
    }
    if task_type not in budgets:
        return min(1024, remaining)

    cap, share = budgets[task_type]
    return min(cap, int(remaining * share))

# Text Splitter
def split_text(text, chunk_size=500, chunk_overlap=50):
    """Split raw text into overlapping chunks wrapped as LangChain documents.

    Args:
        text: The full document text.
        chunk_size: Maximum size of each chunk, as measured by the splitter
            (defaults to the previously hard-coded 500).
        chunk_overlap: Amount shared between neighbouring chunks
            (defaults to the previously hard-coded 50).

    Returns:
        The list produced by ``RecursiveCharacterTextSplitter.create_documents``
        for the single input text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.create_documents([text])

# Vector Store (FAISS)
def create_vector_store(chunks):
    """Embed the given document chunks and index them in a FAISS vector store.

    Args:
        chunks: Documents to index (as produced by ``split_text``).

    Returns:
        The FAISS store built from ``chunks`` using the
        ``sentence-transformers/all-MiniLM-L6-v2`` embedding model.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return FAISS.from_documents(chunks, embedding=embedder)


# Load Ollama LLM
def get_llm():
    """Build the chat model client used by the pipeline.

    Returns:
        A ``ChatOllama`` instance configured for the ``mistral`` model with the
        sampling settings listed below and streaming enabled.
    """
    settings = {
        "model": "mistral",
        "temperature": 0.7,
        "top_k": 40,
        "top_p": 0.9,
        "streaming": True,
    }
    return ChatOllama(**settings)


In [None]:
# Prompt template for Q&A.
# Placeholders filled at call time: {context} (retrieved chunks),
# {chat_history} (prior turns) and {question} (the user's query).
# The template text is sent to the model verbatim, so edit it with care.
qa_prompt = PromptTemplate.from_template("""
You are an intelligent assistant that answers questions based on the context and prior chat history.
Don't repeat answers or fabricate information.

Answer as accurately and concisely as possible.
If the context does not contain the answer, just say: "I’m not sure based on the document."
Only use information from the context below.

Context:
{context}

Chat History:
{chat_history}

Question:
{question}

Answer:
""")


In [None]:
# Prompt template for summarisation.
# Single placeholder: {text}, the document content to be summarised.
# The template text is sent to the model verbatim, so edit it with care.
summary_prompt = PromptTemplate.from_template("""
You are a summarization assistant. Your job is to read the following document and write a clear, concise summary.

- Focus on the key arguments, ideas, and important facts.
- Keep the summary short and easy to understand.
- Do not add anything that is not in the document.

Document:
{text}

Summary:
""")

In [None]:
# RAG reranker: a cross-encoder that scores (query, chunk) pairs so retrieved
# chunks can be re-ordered by relevance. Repeated or too-similar chunks are
# avoided separately, at retrieval time, via MMR (Maximal Marginal Relevance).
# Loaded once at module level and shared by retrieve_and_rerank().
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_and_rerank(query, vectorstore, threshold=0.7, top_k=5):
    """Retrieve diverse candidate chunks, drop distant ones, and rerank the rest.

    Steps:
      1. MMR retrieval of ``top_k * 2`` candidates (diversity vs. relevance
         balanced by ``lambda_mult=0.5``).
      2. Distance filter: a candidate is kept only if its vector-store score is
         ``<= threshold``.
      3. Cross-encoder reranking of the survivors with the module-level
         ``reranker``; the best ``top_k`` are returned.

    Args:
        query: The user's question / search text.
        vectorstore: Store exposing ``as_retriever`` and
            ``similarity_search_with_score`` (a FAISS store in this notebook).
        threshold: Maximum allowed score. FAISS reports a *distance* from
            ``similarity_search_with_score`` (lower means more similar), so
            this is an upper bound, not a minimum similarity.
            NOTE(review): the 0.7 default predates this fix (the comparison
            used to be inverted) — re-tune it against real distances for the
            embedding model in use. Assumes the store is built with its
            default distance metric; confirm if that ever changes.
        top_k: Number of documents to return at most.

    Returns:
        list: Up to ``top_k`` documents, most relevant first according to the
        cross-encoder; ``[]`` when no candidate passes the distance filter.
    """
    candidate_k = top_k * 2
    # MMR selects its results from the `fetch_k` nearest neighbours. Scoring the
    # same pool below guarantees every MMR pick has a known distance instead of
    # silently falling back to a default. 20 is the library's default pool size.
    fetch_k = max(20, candidate_k * 2)

    retriever = vectorstore.as_retriever(
        search_type="mmr",  # Maximal Marginal Relevance
        search_kwargs={"k": candidate_k, "fetch_k": fetch_k, "lambda_mult": 0.5},
    )
    mmr_docs = retriever.get_relevant_documents(query)

    scored_docs = vectorstore.similarity_search_with_score(query, k=fetch_k)
    distance_by_text = {doc.page_content: float(score) for doc, score in scored_docs}

    # Lower distance == closer match, so keep candidates at or below the cut-off.
    # A candidate with no known distance is treated as infinitely far away.
    filtered_docs = [
        doc
        for doc in mmr_docs
        if distance_by_text.get(doc.page_content, float("inf")) <= threshold
    ]
    if not filtered_docs:
        return []

    pairs = [[query, doc.page_content] for doc in filtered_docs]
    rerank_scores = reranker.predict(pairs)

    reranked = sorted(
        zip(filtered_docs, rerank_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [doc for doc, _ in reranked[:top_k]]


In [None]:
# Memory for storing chat history (used with RAG manually by the chat loop).
# Turns are exposed under the "chat_history" key and, because
# return_messages=True, come back as message objects rather than one string.
# Module-level, so it persists across calls until memory.clear() is invoked.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Final Pipeline (RAG + LLM + Memory + Chat Display)
def _format_chat_history(messages, max_turns):
    """Render the most recent memory messages as plain "Role: text" lines for the prompt."""
    if isinstance(messages, str):
        # Memory configured to return a pre-formatted string: pass it through.
        return messages
    # One turn is a user message plus the assistant's reply.
    recent = messages[-2 * max_turns:] if max_turns > 0 else []
    rendered = []
    for msg in recent:
        speaker = "User" if getattr(msg, "type", None) == "human" else "Assistant"
        rendered.append(f"{speaker}: {getattr(msg, 'content', msg)}")
    return "\n".join(rendered)


def _generate_reply(llm, prompt_text, max_tokens):
    """Stream the model's reply to stdout, falling back to one blocking call; return the text or None."""
    request = [HumanMessage(content=prompt_text)]
    # NOTE(review): ChatOllama's generation cap is normally called `num_predict`;
    # confirm that the `max_tokens` keyword is actually honoured by the installed
    # version. It is passed through unchanged here.
    try:
        pieces = []
        for chunk in llm.stream(request, max_tokens=max_tokens):
            print(chunk.content, end="", flush=True)
            pieces.append(chunk.content)
        return "".join(pieces)
    except Exception as stream_error:
        print("\n⚠️ Streaming failed, falling back...\n")
        # Surface the cause instead of discarding it.
        print(f"   ({type(stream_error).__name__}: {stream_error})")

    try:
        reply = llm.invoke(request, max_tokens=max_tokens).content
    except Exception as fallback_error:
        print(f"❌ LLM failed: {fallback_error}")
        return None
    print(reply)
    return reply


# Final Pipeline (RAG + LLM + Memory + Chat Display)
def run_pipeline_with_memory(docs, vectorstore, llm):
    """Run the interactive chat loop over an indexed document.

    The loop repeatedly asks for a mode:
      * ``qna``          - ask a question answered from retrieved chunks plus
                           recent chat history;
      * ``summary``      - summarise the document;
      * ``show history`` - print the recorded question/answer pairs;
      * ``clear``        - reset the module-level ``memory`` and local history;
      * ``exit``         - leave the loop.

    Args:
        docs: The document chunks. Used as a fallback source for ``summary``
            when retrieval returns nothing.
        vectorstore: Vector store passed to ``retrieve_and_rerank``.
        llm: Chat model exposing ``stream`` and ``invoke``.

    Returns:
        None. All results are printed; answers are also saved to the
        module-level ``memory``.
    """
    previous_answers = []
    chat_history_pairs = []
    MAX_HISTORY_TURNS = 5
    # How many leading chunks to summarise when retrieval yields nothing.
    SUMMARY_FALLBACK_CHUNKS = 10

    print("✅ Ready! Start chatting below.")

    while True:
        mode = input("\nChoose mode - 'qna' for Q&A, 'summary' for Summary, 'clear' to reset memory, 'show history' to display chat history, 'exit' to stop: ").strip().lower()

        if mode == "exit":
            break

        elif mode == "clear":
            memory.clear()
            previous_answers.clear()
            chat_history_pairs.clear()
            clear_output(wait=True)
            print("🧹 Memory and chat history cleared.")

        elif mode == "show history":
            if not chat_history_pairs:
                print("ℹ️ No chat history available yet.")
            else:
                print("\n🗃️ Chat History:")
                for i, (stamp, q, a) in enumerate(chat_history_pairs, 1):
                    print(f"{i}. 🕒 {stamp}")
                    print(f"   🧑 You: {q}")
                    print(f"   🤖 Bot: {a}")

        elif mode == "qna":
            query = input("\n🧠 Ask your question: ").strip()
            if not query:
                # Nothing to retrieve or answer; do not spend an LLM call on it.
                print("⚠️ Please enter a question.")
                continue

            relevant_docs = retrieve_and_rerank(query, vectorstore)
            context = "\n\n".join(doc.page_content for doc in relevant_docs)

            # Render history as readable lines (not the repr of message objects)
            # and cap it so the prompt cannot grow without bound.
            history = memory.load_memory_variables({})["chat_history"]
            message = qa_prompt.format(
                chat_history=_format_chat_history(history, MAX_HISTORY_TURNS),
                context=context,
                question=query,
            )
            max_tokens = get_dynamic_max_tokens("qna", message)

            asked_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print(f"\n🕒 {asked_at}")
            print("🧑 You:", query)
            print("🤖 ", end="", flush=True)

            answer = _generate_reply(llm, message, max_tokens)
            if answer is None:
                continue  # Skip this round
            print()

            if answer in previous_answers:
                print("🤖 I've already answered this or something very similar. Try a different question.")
            else:
                previous_answers.append(answer)
                memory.save_context({"question": query}, {"answer": answer})
                chat_history_pairs.append((asked_at, query, answer))

                # Keep only latest MAX_HISTORY_TURNS
                if len(chat_history_pairs) > MAX_HISTORY_TURNS:
                    del chat_history_pairs[:-MAX_HISTORY_TURNS]

        elif mode == "summary":
            docs_for_summary = retrieve_and_rerank("summarize the document", vectorstore)
            if not docs_for_summary:
                # Retrieval found nothing usable; summarising an empty document
                # would invite fabrication, so fall back to the opening chunks.
                docs_for_summary = list(docs[:SUMMARY_FALLBACK_CHUNKS])
            if not docs_for_summary:
                print("⚠️ Nothing to summarize: the document has no text chunks.")
                continue

            context = "\n\n".join(doc.page_content for doc in docs_for_summary)
            prompt = summary_prompt.format(text=context)
            max_tokens = get_dynamic_max_tokens("summary", prompt)

            print(f"\n🕒 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print("📝 Summary:\n")

            summary = _generate_reply(llm, prompt, max_tokens)
            if summary is None:
                continue  # Skip this round
            print()

        else:
            print("⚠️ Invalid mode. Please type 'qna', 'summary', 'show history', 'clear', or 'exit'.")


In [None]:
# Run full pipeline: ask for a PDF, index it, then start the interactive chat loop.
pdf_path = input("Enter path to your PDF (e.g. docs/your_file.pdf): ").strip()
# isfile (not exists) so that a directory path is rejected up front.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError("❌ PDF not found.")

print("⏳ Loading and processing document...")
text = load_pdf(pdf_path)
total_tokens = count_tokens(text)
print(f"📏 Total tokens in PDF: {total_tokens}")
if total_tokens > 8000:
    print("⚠️ Warning: This exceeds the 8192 token limit of Mistral.")
    print("🔁 Only top relevant chunks will be used during Q&A or summary to stay within limits.")

# split_text() already returns document objects, so they are used directly.
# Re-wrapping them required a `Document` name that this notebook never
# imports, which fails on a fresh kernel.
chunks = split_text(text)
docs = chunks

print("📦 Creating vector store...")
vectorstore = create_vector_store(docs)

print("🚀 Loading LLM...")
llm = get_llm()

run_pipeline_with_memory(docs, vectorstore, llm)


🧹 Memory and chat history cleared.
ℹ️ No chat history available yet.

🕒 2025-07-28 01:16:17
🧑 You: who is lila?
🤖  Lila is a 13-year-old girl who discovers an old, time-traveling notebook in her grandmother's attic. She uses the notebook to travel through different points in history and learn from historical events.
⚠️ Invalid mode. Please type 'qna', 'summary', 'show history', 'clear', or 'exit'.

🕒 2025-07-28 01:18:04
📝 Summary:

 A 13-year-old girl named Lila finds a magical time-traveling notebook in her grandmother's attic. The notebook transports her to various historical periods based on what she writes within it, such as medieval times, ancient Egypt, and the future year 3025. She learns that emotional connection to the event is crucial for successful time travel. During one instance, Lila gets stuck in 1942 and solves a historical mystery to return. Her experience teaches her valuable lessons about courage and trust. Upon returning to Eldridge, she decides to use the notebook 