In [20]:
import json
import torch
import re
import numpy as np
from sentence_transformers import SentenceTransformer, util
from langchain.chat_models import ChatOpenAI
from PyPDF2 import PdfReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from word2number import w2n

# ------------------ 1 NLTK Setup ------------------
# Resources needed further down: a Punkt tokenizer model for nltk.word_tokenize,
# the English stop word list, and WordNet for the lemmatizer.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# both are requested so the notebook runs on either side of that change.
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))  # set for O(1) membership tests per token
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ISURU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ISURU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ISURU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# ------------------ 2 Load JSON containing questions and RAG answers ------------------
with open("auto_eval_data.json", "r", encoding="utf-8") as f:
    eval_data = json.load(f)

# Later cells index every record with ["question"] and ["answer"]. Validate here so a
# malformed record fails immediately instead of after the LLM calls have been made.
required_keys = ("question", "answer")
for record_index, record in enumerate(eval_data):
    if not isinstance(record, dict) or any(key not in record for key in required_keys):
        raise ValueError(
            f"auto_eval_data.json record {record_index} must be an object with "
            f"'question' and 'answer' keys, got: {record!r}"
        )

print(f"Loaded {len(eval_data)} questions with RAG answers.")

# ------------------ 3 Extract full text from PDFs ------------------
# PDFs whose text is extracted below. The path is relative, so it is resolved
# against the notebook's current working directory.
pdf_files = [r"NSBM_Details.pdf"]

def get_pdf_text(pdf_paths):
    """Return the text of every page of every given PDF as one string.

    Args:
        pdf_paths: Iterable of PDF locations, each passed straight to PdfReader.

    Returns:
        str: Page texts in reading order, separated by single spaces, with every
        run of whitespace collapsed to one space and the ends trimmed. A page
        whose extract_text() result is falsy contributes no text.
    """
    page_texts = [
        page.extract_text() or ""
        for pdf in pdf_paths
        for page in PdfReader(pdf).pages
    ]
    # Joining with a space and then collapsing whitespace gives one clean line of text.
    return re.sub(r"\s+", " ", " ".join(page_texts)).strip()

# Extract the text once; full_pdf_text is a single whitespace-normalized string
# covering all files listed in pdf_files.
full_pdf_text = get_pdf_text(pdf_files)
print("PDF content loaded.")

Loaded 20 questions with RAG answers.
PDF content loaded.


In [22]:
# ------------------ 4 Initialize LLM ------------------
# Chat model used to produce the reference ("actual") answers.
# NOTE(review): temperature=0 is presumably chosen to keep answers as repeatable as possible.
# NOTE(review): no API key is passed here, so the client presumably reads it from the
# environment — confirm before running on a new machine.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# ------------------ 5 Metric functions using NLTK ------------------
def normalize_text_nltk(text):
    """Normalize free text into a list of comparable tokens.

    The text is lowercased, hyphens become spaces, remaining punctuation is
    removed, and the result is tokenized with NLTK. Stop words are dropped.
    Each surviving token is replaced by its numeric string when word2number
    can parse it (e.g. "two" -> "2"), otherwise by its lemma.

    Args:
        text: The string to normalize.

    Returns:
        list[str]: Normalized tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r"[-]", " ", text)  # replace hyphens with space
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation

    normalized_tokens = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        try:
            normalized_tokens.append(str(w2n.word_to_num(token)))
        except ValueError:
            # word_to_num raises ValueError for anything that is not a number word.
            # Catching only that keeps Ctrl-C and genuine bugs from being silently
            # turned into "just lemmatize it".
            normalized_tokens.append(lemmatizer.lemmatize(token))
    return normalized_tokens

def compute_f1_nltk(pred, truth):
    """Token-overlap F1 between a predicted answer and a reference answer.

    Both strings are run through normalize_text_nltk and compared as bags of
    tokens, so a token repeated k times can match at most k times.

    Args:
        pred: Predicted answer text.
        truth: Reference answer text.

    Returns:
        float: F1 in [0, 1]; 0.0 when either side has no tokens after
        normalization or when the two share no tokens.
    """
    predicted = Counter(normalize_text_nltk(pred))
    expected = Counter(normalize_text_nltk(truth))
    if not (predicted and expected):
        return 0.0

    # Counter intersection keeps the minimum count per token, i.e. the multiset overlap.
    overlap = sum((predicted & expected).values())
    if overlap == 0:
        return 0.0

    precision = overlap / sum(predicted.values())
    recall = overlap / sum(expected.values())
    return 2 * precision * recall / (precision + recall)

# Semantic similarity using SentenceTransformer
# Instantiated once at module level and reused by compute_semantic_similarity on every call.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def compute_semantic_similarity(pred, truth):
    """Cosine similarity between the embeddings of two texts.

    Args:
        pred: Predicted answer text.
        truth: Reference answer text.

    Returns:
        The cosine similarity of the two embeddings as a Python scalar
        (the result of calling .item() on util.cos_sim's output).
    """
    # Encode one text at a time, prediction first, using the shared module-level embedder.
    pred_vec, truth_vec = (
        embedder.encode(text, convert_to_tensor=True) for text in (pred, truth)
    )
    score = util.cos_sim(pred_vec, truth_vec)
    return score.item()


In [23]:
# ------------------ 6 Generate Actual Answers using full PDF ------------------
# One LLM call per question, each with the entire PDF text in the prompt. Answers are
# appended in eval_data order; the list is recreated on every run of this cell.
actual_answers = []
separator = "-" * 60

for question in (entry["question"] for entry in eval_data):
    prompt = f"""
You are a helpful assistant for extracting PDF content.
Answer the question using only the provided PDF context.
Provide only the answer concisely, without repeating the question.
Example: What degree does the Department of Management offer in collaboration with Plymouth University?
RAG Answer: BSc Hons in International Management and Business.

PDF Text: {full_pdf_text}  
Question: {question}
"""
    # Keep only the message text, without surrounding whitespace.
    actual_answer = llm.invoke(prompt).content.strip()
    actual_answers.append(actual_answer)

    print(f"\nQ: {question}")
    print(f"Actual Answer: {actual_answer}")
    print(separator)



Q: Who is the Vice Chancellor of NSBM Green University?
Actual Answer: Prof. E. A. Weerasinghe.
------------------------------------------------------------

Q: What is the role of Prof. Baratha Dodankotuwa at NSBM?
Actual Answer: Head of Academic Development and Quality Assurance.
------------------------------------------------------------

Q: Which faculty is Ms. Thilini De Silva the Dean of?
Actual Answer: Faculty of Business.
------------------------------------------------------------

Q: Name one programme offered by the Faculty of Computing.
Actual Answer: BSc (Hons) in Data Science.
------------------------------------------------------------

Q: What degree does the Department of Management offer in collaboration with Plymouth University?
Actual Answer: BSc (Hons) in International Management and Business.
------------------------------------------------------------

Q: Which department offers the Bachelor of Laws (Honours)?
Actual Answer: Department of Legal Studies.
-------

In [24]:
# ------------------ 7 Evaluate RAG answers against actual answers ------------------
# Reference answers are paired with eval_data purely by position. A length mismatch
# means the generation cell was interrupted or is stale relative to eval_data, so stop
# here rather than fail mid-loop or score only part of the data.
if len(actual_answers) != len(eval_data):
    raise ValueError(
        f"Have {len(actual_answers)} actual answers for {len(eval_data)} questions; "
        "re-run the answer generation cell before evaluating."
    )

f1_scores = []      # token-level F1 per question
semantic_sims = []  # embedding cosine similarity per question

for sample, actual_answer in zip(eval_data, actual_answers):
    rag_answer = sample["answer"]  # RAG-generated answer

    f1 = compute_f1_nltk(rag_answer, actual_answer)
    sim = compute_semantic_similarity(rag_answer, actual_answer)

    f1_scores.append(f1)
    semantic_sims.append(sim)

    print(f"\nQ: {sample['question']}")
    print(f"RAG Answer: {rag_answer}")
    print(f"Actual Answer: {actual_answer}")
    print(f"Token F1: {f1:.2f}, Semantic Similarity: {sim:.2f}")
    print("-" * 60)


Q: Who is the Vice Chancellor of NSBM Green University?
RAG Answer: Prof. E. A. Weerasinghe
Actual Answer: Prof. E. A. Weerasinghe.
Token F1: 1.00, Semantic Similarity: 0.99
------------------------------------------------------------

Q: What is the role of Prof. Baratha Dodankotuwa at NSBM?
RAG Answer: Head of Academic Development and Quality Assurance.
Actual Answer: Head of Academic Development and Quality Assurance.
Token F1: 1.00, Semantic Similarity: 1.00
------------------------------------------------------------

Q: Which faculty is Ms. Thilini De Silva the Dean of?
RAG Answer: Faculty of Business
Actual Answer: Faculty of Business.
Token F1: 1.00, Semantic Similarity: 0.98
------------------------------------------------------------

Q: Name one programme offered by the Faculty of Computing.
RAG Answer: BSc Hons in Computer Science
Actual Answer: BSc (Hons) in Data Science.
Token F1: 0.75, Semantic Similarity: 0.82
-----------------------------------------------------------

In [25]:
# ------------------ 8 Summary Metrics ------------------
# Unweighted arithmetic means over the per-question scores collected in f1_scores and
# semantic_sims; both lists must already be populated when this cell runs.
print("\n==== Overall Evaluation ====")
print(f"Average Token F1: {np.mean(f1_scores):.2f}")
print(f"Average Semantic Similarity: {np.mean(semantic_sims):.2f}")


==== Overall Evaluation ====
Average Token F1: 0.90
Average Semantic Similarity: 0.91
