In [54]:
import json
import torch
import re
from PyPDF2 import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ------------------ 1 Extract Text from PDFs ------------------
def get_pdf_text(pdf_paths):
    """
    Extract all text from one or more PDF files and return it as a single string.

    Args:
        pdf_paths: An iterable of PDF paths, or a single path given as
            ``str``, ``bytes`` or a path-like object.

    Returns:
        The text of every page, in order, joined by single spaces and with
        leading/trailing whitespace stripped. Runs of spaces/tabs inside a
        page are collapsed to one space; line breaks are preserved. A page
        for which ``extract_text()`` returns nothing contributes an empty
        string.
    """
    # A lone path would otherwise be iterated character by character.
    if isinstance(pdf_paths, (str, bytes)) or hasattr(pdf_paths, "__fspath__"):
        pdf_paths = [pdf_paths]

    page_texts = []
    for pdf in pdf_paths:
        reader = PdfReader(pdf)
        for page in reader.pages:
            page_text = page.extract_text() or ""
            # Collapse horizontal whitespace only; "\r" and "\n" are kept.
            page_texts.append(re.sub(r"[^\S\r\n]+", " ", page_text))

    # Join once instead of growing a string with += on every page.
    return " ".join(page_texts).strip()

# PDFs to index; a bare file name is resolved against the current working directory.
pdf_files = ["NSBM_Details.pdf"]
raw_text = get_pdf_text(pdf_files)

In [55]:
# Sanity check: confirm the PDF yielded readable text before it is chunked.
print(raw_text[:500])  # first 500 characters only

National School of Business Management Green University Town (NSBM Green 
University) 
The NSBM Academic Leadership Hierarchy is led by the Executive Management, which 
includes the Vice Chancellor, Prof. E. A. Weerasinghe, the Deputy Vice Chancellor, Prof. 
Chaminda Rathnayake, and the Head of Academic Development and Quality Assurance, Prof. 
Baratha Dodankotuwa. The Deans of the faculties are Ms. Thilini De Silva for the Faculty of 
Business, Dr. Rasika Ranaweera for the Faculty of Computing,


In [56]:
# ------------------ 2 Split Text and Create FAISS ------------------
# Chunk the extracted text. Separators are listed from coarsest (blank line)
# to finest (single space); sizes are measured in characters via len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=250,
    length_function=len,
    separators=["\n\n", "\n", ".", " "],
)
text_chunks = text_splitter.split_text(raw_text)

print(f"Total chunks: {len(text_chunks)}")
# Show the whole first chunk as a quick check of the split.
print(text_chunks[0])

Total chunks: 20
National School of Business Management Green University Town (NSBM Green 
University) 
The NSBM Academic Leadership Hierarchy is led by the Executive Management, which 
includes the Vice Chancellor, Prof. E. A. Weerasinghe, the Deputy Vice Chancellor, Prof. 
Chaminda Rathnayake, and the Head of Academic Development and Quality Assurance, Prof. 
Baratha Dodankotuwa. The Deans of the faculties are Ms. Thilini De Silva for the Faculty of 
Business, Dr. Rasika Ranaweera for the Faculty of Computing, Dr. Chandana Perera for the 
Faculty of Engineering, Dr. Nuwanthi Katuwavila for the Faculty of Science , and Dr. Piyumi 
Udeshinee for the Faculty of Postgraduate Studies and Professional Advancement. 
 
Faculties and Departments 
NSBM comprises several key faculties, each housing specialized departments and programmes. 
The Faculty of Business is structured around the Department of Management, the Department of 
Accounting and Finance, and the Department of Legal Studies, off

In [57]:
# Run the embedding model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

embeddings = HuggingFaceEmbeddings(
    model_kwargs=dict(device=device),
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed every chunk and build the FAISS index from the results.
vector_store = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

# Persist the index to the "faiss_index" folder so it can be reloaded later.
vector_store.save_local("faiss_index")

In [58]:
# ------------------ 3 Initialize LLM ------------------
# temperature=0 asks for output that is as repeatable as the API allows.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# ------------------ 4 Generate Candidate Questions ------------------
# Only the first 50000 characters of the document are sent, to bound the prompt size.
# This note must stay OUT of the f-string: a "#" inside the literal is not a Python
# comment and would be sent to the model as part of the prompt.
question_prompt = f"""
Generate 20 concise factual questions based on the following text. 
Questions should be suitable for a PDF Q&A evaluation dataset.

Text:
{raw_text[:50000]}
"""
questions_response = llm.invoke(question_prompt)
questions_text = questions_response.content.strip()

# One question per non-blank line. Strip surrounding whitespace BEFORE removing the
# leading numbering ("1.", "2)", "-") so indented lines are cleaned as well.
cleaned_lines = (
    re.sub(r"^[0-9\.\)\-]+\s*", "", line.strip()).strip()
    for line in questions_text.splitlines()
)
# Drop lines that consisted only of numbering/separator characters (e.g. "---"),
# which would otherwise become empty questions.
questions = [q for q in cleaned_lines if q]

In [59]:
# List the generated questions, numbered from 1.
for position, q in enumerate(questions, start=1):
    print(f"{position}. {q}")

1. Who is the Vice Chancellor of NSBM Green University?
2. What is the role of Prof. Baratha Dodankotuwa at NSBM?
3. Which faculty is Ms. Thilini De Silva the Dean of?
4. Name one programme offered by the Faculty of Computing.
5. What degree does the Department of Management offer in collaboration with Plymouth University?
6. Which department offers the Bachelor of Laws (Honours)?
7. What is the focus of the Faculty of Science at NSBM?
8. How many years is the BSc (Hons) in Data Science programme?
9. What is a key component of the BSc (Hons) in Data Science programme?
10. What are the entry qualifications for the BSc (Hons) in Computer Science?
11. What is the course fee per semester for the BSc (Hons) in Computer Science?
12. Which degree combines general business knowledge with specialized training in business analytics?
13. What is the total course fee for the BM (Hons) in Business Analytics over four years?
14. What type of training does the BBM (Hons) in Tourism, Hospitality and E

In [None]:
# ------------------ 5 Generate Reference Answers ------------------
# For every question: retrieve the 3 most similar chunks, ask the LLM to answer
# from that context only, and store question / answer / context together.
eval_data = []
for question in questions:
    # Retrieval: the top-3 chunks, flattened into one whitespace-normalised string.
    docs = vector_store.similarity_search(question, k=3)
    context_text = re.sub(
        r"\s+", " ", " ".join(doc.page_content for doc in docs)
    ).strip()

    # Generation: the prompt text below is sent verbatim, indentation included.
    answer_prompt = f"""
    Based on the following context, answer the question concisely.
    Provide only the answer.

    Context: {context_text}
    Question: {question}
    """
    response = llm.invoke(answer_prompt)

    # Normalise the answer: collapse whitespace, then keep only word characters,
    # whitespace and . , ' -
    # NOTE(review): this also removes characters such as ( ) / : %, so an answer
    # like "BSc (Hons)" is stored as "BSc Hons" - confirm this is intended.
    gold_answer = re.sub(r"\s+", " ", response.content.strip())
    gold_answer = re.sub(r"[^\w\s.,'-]", "", gold_answer)

    eval_data.append(
        dict(question=question, answer=gold_answer, context=context_text)
    )



In [61]:
# Show the chunks retrieved for the last question. The store is queried explicitly
# (same k=3 as the answer-generation loop) instead of reading the `docs` variable
# left behind by that loop, so this cell does not depend on the loop having run and
# does not raise NameError when `questions` is empty.
if questions:
    for doc in vector_store.similarity_search(questions[-1], k=3):
        print(doc.page_content)

University. Locally offered honours programmes inc lude the BSc Engineering Honours in 
Mechatronic Engineering, BSc Engineering Honours in Computer System Engineering, and BSc 
Engineering Honours in Electrical and Electronic Engineering. The faculty also offers design -
focused degrees such as the Bachelor o f Interior Design and the BA (Hons) in Interior Design. 
The Faculty of Science provides programmes across biomedical, pharmaceutical, health, and 
psychology fields. Its offerings include the BSc (Hons) in Biomedical Science, the BSc (Hons) in 
Pharmaceutical Science, and the BSc (Hons) in Nutrition and Health, a s well as a Foundation 
Programme for Bachelor’s Degree. In collaboration with Plymouth University, UK, the faculty 
delivers the BSc (Hons) Psychology, the BSc (Hons) Nursing, the BSc (Hons) Biomedical 
Science, and the BSc (Hons) Nursing Top -Up Degree. 
 
About BSC (HONOURS) IN DATA SCIENCE: 
The BSc (Hons) in Data Science, approved by the Ministry of Higher Educatio

In [63]:
# ------------------ 6 Save JSON ------------------
# Write the records as UTF-8 JSON, 4-space indented, with non-ASCII characters
# kept as-is rather than escaped.
with open("auto_eval_data.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(eval_data, ensure_ascii=False, indent=4))

print("Automated evaluation JSON created: auto_eval_data.json")

Automated evaluation JSON created: auto_eval_data.json
