In [1]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import os
import pandas as pd
import fitz  # PyMuPDF
import re
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
def clean_text(text: str) -> str:
    """Strip textbook boilerplate from one page of extracted PDF text.

    Steps, in order:
      1. drop running headers/footers ("Reprint 20xx-xx", "CHAPTER n", "Page n");
      2. drop figure captions, "Types of ..." label lines and anything from the
         word "diagram" to the end of its line;
      3. drop single-letter labels such as "(a)" or "(B)";
      4. re-join words that were hyphenated across a line break;
      5. flatten newlines and runs of whitespace to single spaces;
      6. normalise en/em dashes to a plain "-".

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text on one line, without leading/trailing whitespace.
    """
    # --- Remove headers/footers ---
    text = re.sub(r"Reprint\s*20\d{2}-\d{2}", "", text)
    text = re.sub(r"CHAPTER\s*\d+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Page\s*\d+", "", text, flags=re.IGNORECASE)

    # --- Remove figure captions and labels (line-based, so run before newlines are flattened) ---
    text = re.sub(r"Figure\s*\d+(\.\d+)*[^\n]*", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Types of [^\n]*\n?", "", text)  # e.g. "Types of aestivation..."
    text = re.sub(r"\bdiagram\b[^\n]*", "", text, flags=re.IGNORECASE)

    # --- Remove loose labels like (a), (b), (c), etc. ---
    text = re.sub(r"\([a-z]\)", "", text)
    text = re.sub(r"\([A-Z]\)", "", text)

    # --- Join words hyphenated across a line break ("photo-\nsynthesis") ---
    # This must happen while the newline is still present: once lines are
    # flattened, a line-break hyphen cannot be told apart from a spaced dash
    # ("cells - the unit") or a suspended hyphen ("pre- and post-natal"),
    # and removing those would glue unrelated words together.
    text = re.sub(r"(?<=\w)-[ \t\r]*\n\s*(?=\w)", "", text)

    # --- Remove multiple spaces, newlines ---
    text = re.sub(r"\n+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    # --- Normalise en dash (U+2013) and em dash (U+2014) to a plain hyphen ---
    text = text.replace("\u2013", "-").replace("\u2014", "-")

    return text.strip()


# Folder (relative to the notebook) holding the chapter PDFs.
pdf_folder = "ncert_book"

# Sorted so that pages are ingested in a stable, reproducible file order.
pdf_files = sorted(
    os.path.join(pdf_folder, f)
    for f in os.listdir(pdf_folder)
    if f.lower().endswith(".pdf")
)

print(f"📂 Found {len(pdf_files)} PDF files in '{pdf_folder}'")

# One Document per non-empty page, across all PDFs.
all_docs = []

for pdf_path in pdf_files:
    print(f"📘 Extracting {pdf_path} with PyMuPDF...")

    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # Same value as the former bare `flags=1`, spelled with PyMuPDF's named constant.
            text = page.get_text("text", flags=fitz.TEXT_PRESERVE_LIGATURES)
            cleaned = clean_text(text)  # already stripped by clean_text
            if cleaned:
                # Keep the 1-based page number so a retrieved chunk can be traced back to its page.
                all_docs.append(
                    Document(
                        page_content=cleaned,
                        metadata={"source": pdf_path, "page": page_number},
                    )
                )

# Counts only pages that still had text after cleaning.
print(f"✅ Extraction complete — total pages processed: {len(all_docs)}")


📂 Found 32 PDF files in 'ncert_book'
📘 Extracting ncert_book\kebo101.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo102.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo103.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo104.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo105.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo106.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo107.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo108.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo109.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo110.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo111.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo112.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo113.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo114.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo115.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo116.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo117.pdf with PyMuPDF...
📘 Extracting ncert_book\kebo118.pdf with PyMuPDF...
📘 Extracting ncert_book\keb

In [None]:
# Sanity check: number of non-empty page Documents collected by the extraction cell.
len(all_docs)

1. **sentence-transformers/all-MiniLM-L6-v2**

   * Lightweight and efficient. For many semantic search tasks this is a strong baseline. ([suparva.com][2])
   * If your retrieval corpus is moderate size and you don’t have extremely heavy compute, this is a safe start.

2. **sentence-transformers/all-mpnet-base-v2**

   * Higher accuracy than MiniLM, but somewhat heavier. Many practitioners pick this when quality matters more than speed (it also comes up in Reddit suggestions). ([Reddit][4])
   * Good for textbook/educational content where precision matters.

3. **intfloat/e5-base** (or variants like e5-small/v2)

   * The blog “Top 10 embedding models you should know” lists E5 as strong for retrieval. ([linkedin.com][5])
   * Might be slightly heavier, but worth it if you want better retrieval.

4. **BAAI/bge-base-en-v1.5**

   * Especially recommended in Reddit discussions for retrieval tasks. ([Reddit][6])
   * Could be a high‐quality choice if you have the compute and want best performance.

5. **Custom / domain fine-tuned embedding model**

   * If you find that no out-of-the-box model hits your retrieval accuracy / MCQ context well (because NCERT + NEET style is somewhat niche), you might consider fine-tuning one of the above on your domain (textbook passages + question/answer pairs) so embeddings align well with question retrieval.
   * For example you could take mpnet or e5 and fine‐tune via contrastive loss on NCERT passage ↔ MCQ answer pairs.


In [7]:
# Alternative embedding backends, currently disabled.
# NOTE(review): GoogleGenerativeAIEmbeddings is not imported in the imports cell
# shown in this notebook — add the import before re-enabling that line.
# embeddings = OllamaEmbeddings(model="nomic-embed-text")
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Embedding model used for both indexing the chunks and embedding the questions.
# normalize_embeddings is forwarded to the encoder through encode_kwargs.
# NOTE(review): the E5 model card says inputs should be prefixed with
# "query: " / "passage: "; no such prefix is added in the cells shown here —
# confirm whether that is intentional.
embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base",
    encode_kwargs={"normalize_embeddings": True}
)

# Chat model that answers the MCQs; temperature=0 presumably to keep replies
# as repeatable as possible for evaluation.
# NOTE(review): no API key is passed here — presumably read from the environment; confirm.
llm = ChatGroq(model_name="openai/gpt-oss-120B", temperature=0)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# Write every cleaned page into a single UTF-8 text file, with a dashed rule
# between consecutive pages, so the extraction result can be inspected by eye.
output_file = "full_ncert.txt"
page_separator = "\n----------------------------------------\n"
joined_pages = page_separator.join(page_doc.page_content for page_doc in all_docs)
with open(output_file, mode="w", encoding="utf-8") as dump_handle:
    dump_handle.write(joined_pages)

In [10]:
# Split every page into small overlapping chunks for retrieval.
# The separators are tried in order; clean_text() has already flattened all
# newlines, so in practice the sentence punctuation and the space do the work.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=80,
    separators=["\n\n", "\n", ".", "?", "!", " "]
)
chunks = text_splitter.split_documents(all_docs)

VECTOR_DB_PATH = "ncert-vector-store"

# Saving is opt-in (off by default, as the save call used to be commented out).
# An index on disk is tied to the embedding model and chunking that built it:
# delete VECTOR_DB_PATH whenever either changes, or the load branch below will
# silently serve a stale index.
PERSIST_VECTORSTORE = False

if os.path.exists(VECTOR_DB_PATH):
    print("🔁 Loading existing vectorstore...")
    # SECURITY: load_local unpickles the stored docstore, which is why the
    # explicit allow_dangerous_deserialization opt-in is required. Only load
    # index directories that this notebook created itself.
    vectorstore = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    print("✅ Existing Vectorstore loaded from:", VECTOR_DB_PATH)
else:
    print("🧠 Creating new vectorstore...")
    vectorstore = FAISS.from_documents(chunks, embedding=embeddings)
    if PERSIST_VECTORSTORE:
        vectorstore.save_local(VECTOR_DB_PATH)
        print("✅ Vectorstore saved at:", VECTOR_DB_PATH)
    else:
        # Previously this branch reported "saved" although nothing was written.
        print("✅ Vectorstore built in memory (not saved to disk)")

# Return the 10 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Prompt template with two placeholders: {context} (retrieved text) and
# {question} (the question, including its options for MCQs).
# NOTE(review): for an MCQ whose context is insufficient, the fallback sentence
# and the "answer just the option" rule pull in different directions; code that
# expects a single option letter should be prepared for the fallback sentence.
prompt = ChatPromptTemplate.from_template("""
You are an NCERT-based NEET Biology assistant.
Use only the context provided below to answer the question.

Context:
{context}

Question:
{question}

Guidelines:
- Base your answer strictly on the context (ignore outside knowledge).
- If the context does not provide enough info, say:
  "The context does not provide this information."
- Answer clearly and concisely.
If it is an MCQ question it will have 4 options - A, B, C, D 
Answer just the option - like "B"
""")

🧠 Creating new vectorstore...
✅ Vectorstore saved at: ncert-vector-store


In [11]:
# Pipeline: the filled-in prompt is piped into the chat model.
chain = prompt | llm

# Sample MCQ in the same "question + A-D options" layout the prompt describes.
# Not referenced by the other cells shown here; handy for a manual
# get_answer(question) smoke test.
question = """
Which one of the following is the smallest living cell and lacks a true nucleus?

A. Fungi
B. Bacterium
C. Alga
D. Virus
"""

def get_answer(question, verbose=True, context_file="retrieved_context_ncert.txt"):
    """Answer one question from retrieved context.

    Retrieves chunks for `question` with the module-level `retriever`, joins
    them into a single context string, and runs the module-level `chain` on
    that context and question.

    Args:
        question: Question text (for MCQs, including the A-D options).
        verbose: When True (default), print the question and the model's answer.
        context_file: Path the retrieved chunks are written to for inspection.
            The file is overwritten on every call, so after a batch run it only
            holds the context of the last question. Pass None (or "") to skip
            writing it.

    Returns:
        The text content of the model's response.
    """
    retrieved_docs = retriever.invoke(question)
    chunk_texts = [doc.page_content for doc in retrieved_docs]

    response = chain.invoke({
        "context": "\n\n".join(chunk_texts),
        "question": question,
    })

    if verbose:
        print("🧠 Question:", question)
        print("💬 Answer:", response.content)

    if context_file:
        with open(context_file, "w", encoding="utf-8") as f:
            f.write("\n----------------------------------------\n".join(chunk_texts))

    return response.content


In [13]:
def _extract_option(response):
    """Return the option letter (A-D) found in a model reply, or "" if there is none."""
    text = (response or "").strip()
    if not text:
        return ""
    # Prefer a standalone capital A-D, so "Answer: B" or "**B**" is read as B
    # rather than as whatever the first character happens to be.
    match = re.search(r"(?<![A-Za-z])[A-D](?![A-Za-z])", text)
    if match:
        return match.group(0)
    # Otherwise fall back to the first character, so a lowercase reply such as
    # "b" or "b. Bacterium" still counts.
    first = text[0].upper()
    return first if first in "ABCD" else ""


# Expected columns: Question, Option A..Option D, Correct Answer.
df = pd.read_csv("NCERT_Biology_Class11_NEET_MCQ.csv")

results = []

for _, row in df.iterrows():
    # Rebuild the MCQ in the "question + lettered options" layout the prompt expects.
    full_question = (
        f"{row['Question']}\n"
        f"A. {row['Option A']}\n"
        f"B. {row['Option B']}\n"
        f"C. {row['Option C']}\n"
        f"D. {row['Option D']}"
    )

    predicted = get_answer(full_question)
    predicted_option = _extract_option(predicted)
    # str() guards against a non-string cell (e.g. NaN for a missing answer).
    correct = str(row["Correct Answer"]).strip().upper()
    is_correct = predicted_option == correct

    results.append({
        "Question": row["Question"],
        "Predicted": predicted_option,
        "Correct": correct,
        "Result": "CORRECT" if is_correct else "FALSE"
    })

# Explicit columns keep the frame well-formed even when the CSV has no rows.
eval_df = pd.DataFrame(results, columns=["Question", "Predicted", "Correct", "Result"])

# Guard against an empty question set instead of dividing by zero.
accuracy = (eval_df["Result"] == "CORRECT").sum() / len(eval_df) * 100 if len(eval_df) else 0.0
print(f"\n📊 Evaluation complete! Accuracy: {accuracy:.2f}%\n")

# Optionally, save results
eval_df.to_csv("RAG_NCERT_Evaluation_Results.csv", index=False)
print("✅ Results saved to 'RAG_NCERT_Evaluation_Results.csv'")


🧠 Question: Which organelle is the primary site of ATP synthesis in eukaryotic cells?
A. Nucleus
B. Mitochondrion
C. Ribosome
D. Endoplasmic reticulum
💬 Answer: B
🧠 Question: Which macromolecule is primarily responsible for catalyzing biochemical reactions?
A. Carbohydrate
B. Lipid
C. Protein
D. Nucleic acid
💬 Answer: C
🧠 Question: The basic structural unit of a nucleic acid is called:
A. Amino acid
B. Nucleotide
C. Monosaccharide
D. Fatty acid
💬 Answer: B
🧠 Question: Which bond stabilizes the α-helix and β-sheet structures in proteins?
A. Peptide bond
B. Ionic bond
C. Disulfide bond
D. Hydrogen bond
💬 Answer: D
🧠 Question: Which organelle is involved in synthesis of lipids and detoxification in animal cells?
A. Rough endoplasmic reticulum
B. Golgi apparatus
C. Smooth endoplasmic reticulum
D. Lysosome
💬 Answer: C
🧠 Question: Which of the following is NOT a component of a typical eukaryotic cell membrane?
A. Phospholipids
B. Cholesterol
C. Peptidoglycan
D. Proteins
💬 Answer: C
🧠 Questio