In [None]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_ollama import ChatOllama
import os
import pandas as pd
from ncert_constants import NCERT_FILE, PAGE_DELIMITER
import fitz  # PyMuPDF
import re
from langchain_core.documents import Document

In [None]:
# Read the whole NCERT text file in one go; pages inside it are separated by
# PAGE_DELIMITER (both names come from ncert_constants).
with open(NCERT_FILE, "r", encoding="utf-8") as f:
    content = f.read()

pages = content.split(PAGE_DELIMITER)

# One Document per page. The 1-based page number is recorded in the metadata
# (the original loop computed it but dropped it), so a retrieved chunk can be
# traced back to the page it came from.
all_docs = [
    Document(page_content=page, metadata={"page": page_number})
    for page_number, page in enumerate(pages, start=1)
]

In [56]:
# --- Embedding model --------------------------------------------------------
# Active choice: intfloat/e5-base, with normalized output vectors.
# Alternatives tried earlier (swap in by replacing the constructor call below):
#   OllamaEmbeddings(model="nomic-embed-text")
#   GoogleGenerativeAIEmbeddings(model="models/embedding-001")
#   HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2",
#                         encode_kwargs={"normalize_embeddings": True})
# NOTE(review): the E5 model card recommends "query: " / "passage: " text
# prefixes; nothing in this cell adds them - confirm whether that matters here.
embedding_options = dict(
    model_name="intfloat/e5-base",
    encode_kwargs=dict(normalize_embeddings=True),
)
embeddings = HuggingFaceEmbeddings(**embedding_options)

# --- Chat model -------------------------------------------------------------
# Active choice: Kimi K2 served through Groq at a low sampling temperature.
# Alternatives tried earlier:
#   ChatGroq(model_name="openai/gpt-oss-20B", temperature=0)
#   ChatOllama(model="gemma2:2b")
llm_options = dict(
    model_name="moonshotai/kimi-k2-instruct-0905",
    temperature=0.1,
)
llm = ChatGroq(**llm_options)

In [None]:
# Split each page into overlapping chunks: at most 450 units long with an
# 80-unit overlap (characters, under the splitter's default length function),
# preferring to break on paragraph, line, sentence and finally word boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=80,
    separators=["\n\n", "\n", ".", "?", "!", " "]
)
chunks = text_splitter.split_documents(all_docs)

# Directory used as an on-disk cache for the FAISS index.
# Delete it whenever the embedding model, the chunking parameters or the source
# text change, otherwise the stale index is loaded instead of being rebuilt.
VECTOR_DB_PATH = "ncert-vector-store"

if os.path.exists(VECTOR_DB_PATH):
    print("üîÅ Loading existing vectorstore...")
    # load_local unpickles the stored docstore, hence the explicit opt-in flag.
    # Only load an index directory that this notebook created itself.
    vectorstore = FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)
    print("‚úÖ Existing Vectorstore loaded from:", VECTOR_DB_PATH)
else:
    print("üß† Creating new vectorstore...")
    vectorstore = FAISS.from_documents(chunks, embedding=embeddings)
    # Persist the index. This call was commented out, so the "saved" message
    # below was untrue and the load branch above could never be taken.
    vectorstore.save_local(VECTOR_DB_PATH)
    print("‚úÖ Vectorstore saved at:", VECTOR_DB_PATH)

In [57]:
# Number of chunks fetched from the vector store for every question.
TOP_K = 15
retriever = vectorstore.as_retriever(search_kwargs=dict(k=TOP_K))

# Instruction text sent to the chat model. The {context} and {question}
# placeholders are filled in for each question.
ANSWER_TEMPLATE = """
You are an NCERT-based NEET Biology assistant.
Use only the context provided below to answer the question.

Context:
{context}

Question:
{question}

Guidelines:
- Base your answer strictly on the context (ignore outside knowledge).
- If the context does not provide enough info, say:
  "The context does not provide this information."
- Answer clearly and concisely.
If it is an MCQ question it will have 4 options - A, B, C, D 
Answer just the option - like "B"
"""

prompt = ChatPromptTemplate.from_template(ANSWER_TEMPLATE)

In [58]:
# Sample MCQ for trying the pipeline by hand. The text starts and ends with a
# newline, exactly like a triple-quoted block would.
question = (
    "\n"
    "Which one of the following is the smallest living cell and lacks a true nucleus?\n"
    "\n"
    "A. Fungi\n"
    "B. Bacterium\n"
    "C. Alga\n"
    "D. Virus\n"
)

# Pipeline: the filled-in prompt is piped straight into the chat model.
chain = prompt | llm

def get_answer(question, output_file="retrieved_context_ncert.txt"):
    """Answer one question from chunks retrieved out of the vector store.

    Uses the module-level ``retriever`` to fetch chunks for ``question``, joins
    their text into a single context string and runs the module-level ``chain``
    on it.

    Args:
        question: Question text; for an MCQ this includes the options.
        output_file: Path of a debug dump that receives the retrieved chunks,
            separated by dashed lines. The file is overwritten on every call,
            so it only ever holds the chunks of the latest question. Pass
            ``None`` (or an empty string) to skip writing the dump.

    Returns:
        The ``content`` attribute of the chain's response.
    """
    retrieved_docs = retriever.invoke(question)
    # Extract the chunk texts once; both the prompt context and the debug dump
    # are built from the same list.
    chunk_texts = [doc.page_content for doc in retrieved_docs]

    response = chain.invoke({
        "context": "\n\n".join(chunk_texts),
        "question": question,
    })

    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("\n----------------------------------------\n".join(chunk_texts))

    return response.content


In [59]:
def _extract_option(raw_answer):
    """Return the option letter (A-D) chosen in a model reply, or "" if none.

    Any ``<think>...</think>`` section is removed first. A reply that starts
    with an option letter (optionally wrapped in quotes, brackets or markdown
    asterisks, e.g. ``B``, ``b.``, ``(C)``, ``**D**``) yields that letter.
    Otherwise the first stand-alone capital A-D anywhere in the reply is used,
    so ``Answer: B`` yields ``B`` rather than the ``A`` of "Answer". Replies
    without any stand-alone option letter yield "".
    """
    if not isinstance(raw_answer, str):
        return ""
    cleaned = re.sub(r"<think>.*?</think>", "", raw_answer, flags=re.DOTALL).strip()

    leading = re.match(r"[\s\"'(\[*]*([A-Da-d])(?![A-Za-z0-9])", cleaned)
    if leading:
        return leading.group(1).upper()

    anywhere = re.search(r"(?<![A-Za-z0-9])([A-D])(?![A-Za-z0-9])", cleaned)
    return anywhere.group(1).upper() if anywhere else ""


# MCQ benchmark: one row per question with four options and the answer key.
df = pd.read_csv("NCERT_Biology_Class11_NEET_MCQ.csv")

results = []

for i, row in df.iterrows():
    # Same layout the prompt describes: question text followed by options A-D.
    full_question = (
        f"{row['Question']}\n"
        f"A. {row['Option A']}\n"
        f"B. {row['Option B']}\n"
        f"C. {row['Option C']}\n"
        f"D. {row['Option D']}"
    )

    print(f"---- {i} ----")
    predicted_option = _extract_option(get_answer(full_question))
    # str() keeps a missing answer key (NaN) from raising; such a row simply
    # never matches a predicted option.
    correct = str(row["Correct Answer"]).strip().upper()
    is_correct = predicted_option == correct
    print("CORRECT" if is_correct else "WRONG")

    # Note: the saved label for a miss is "FALSE" while the console shows
    # "WRONG"; the saved value is kept unchanged for existing result files.
    results.append({
        "Question": row["Question"],
        "Predicted": predicted_option,
        "Correct": correct,
        "Result": "CORRECT" if is_correct else "FALSE"
    })

# Explicit columns keep the frame (and the CSV header) well-formed even when
# the question file has no rows.
eval_df = pd.DataFrame(results, columns=["Question", "Predicted", "Correct", "Result"])

if eval_df.empty:
    # Avoid dividing by zero when there was nothing to evaluate.
    accuracy = 0.0
else:
    accuracy = (eval_df["Result"] == "CORRECT").sum() / len(eval_df) * 100
print(f"\nüìä Evaluation complete! Accuracy: {accuracy:.2f}%\n")

# Optionally, save results
eval_df.to_csv("RAG_NCERT_Evaluation_Results.csv", index=False)
print("‚úÖ Results saved to 'RAG_NCERT_Evaluation_Results.csv'")


---- 0 ----
CORRECT
---- 1 ----
CORRECT
---- 2 ----
CORRECT
---- 3 ----
CORRECT
---- 4 ----
CORRECT
---- 5 ----
CORRECT
---- 6 ----
CORRECT
---- 7 ----
CORRECT
---- 8 ----
CORRECT
---- 9 ----
CORRECT
---- 10 ----
WRONG
---- 11 ----
CORRECT
---- 12 ----
CORRECT
---- 13 ----
CORRECT
---- 14 ----
CORRECT
---- 15 ----
WRONG
---- 16 ----
CORRECT
---- 17 ----
CORRECT
---- 18 ----
CORRECT
---- 19 ----
CORRECT
---- 20 ----
CORRECT
---- 21 ----
CORRECT
---- 22 ----
CORRECT
---- 23 ----
CORRECT
---- 24 ----
CORRECT
---- 25 ----
CORRECT
---- 26 ----
CORRECT
---- 27 ----
CORRECT
---- 28 ----
CORRECT
---- 29 ----
CORRECT
---- 30 ----
CORRECT
---- 31 ----
CORRECT
---- 32 ----
WRONG
---- 33 ----
CORRECT
---- 34 ----
CORRECT
---- 35 ----
CORRECT
---- 36 ----
CORRECT
---- 37 ----
CORRECT
---- 38 ----
CORRECT
---- 39 ----
CORRECT
---- 40 ----
CORRECT
---- 41 ----
CORRECT
---- 42 ----
CORRECT
---- 43 ----
CORRECT
---- 44 ----
CORRECT
---- 45 ----
CORRECT
---- 46 ----
CORRECT
---- 47 ----
WRONG
---- 48 --