## Context Tools

In [None]:
from __future__ import annotations
from typing import List, Optional
import json
import re
from dataclasses import dataclass
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from pypdf import PdfReader


In [52]:
root = Path.cwd()
data_path= root / "data"

curriculum = data_path / "dsa_degree_requirements.pdf"
chs_requirements = data_path / "dsa_chs_requirements.json"

### Document Creating

Firstly, we create Document types with information from the PDF and JSON files in data.

We first create a function to read the curriculum PDF

In [53]:
def read_pdf(path):
    reader = PdfReader(str(path))
    docs = []
    # iterate through pages
    for index, page in enumerate(reader.pages):
        # extract text from page
        text = page.extract_text()
        if not text:
            continue
        # docs append the page
        docs.append(
            # Document type from langchain
            Document(
                # context is text
                page_content=text,
                # metadata
                metadata={
                    "source": str(path.relative_to(root)),
                    "page": index,
                    "category": "curriculum",
                },
            )
        )
    return docs

print(read_pdf(curriculum)[0])


page_content='Requirements for Major in Data Science and Analytics
Applicable to cohorts AY2021/2022 and after
Levels Major Requirements Cumulative 
Major Units
Pass
-  DSA1101 Introduction to Data Science1
Pass
-  CS2040 Data Structures and Algorithms2
-  DSA2101 Essential Data Analytics Tools: Data Visualisation
-  DSA2102 Essential Data Analytics Tools: Numerical Computation
-  MA2001 Linear Algebra I
-  MA2002 Calculus
-  MA2311 Techniques in Advanced Calculus
 or MA2104 Multivariable Calculus
-  ST2131/MA2116/MA2216 Probability
-  ST2132 Mathematical Statistics
Pass
-  CS3244 Machine Learning
-  DSA3101 Data Science in Practice
-  DSA3102 Essential Data Analytics Tools: Convex Optimisation
-  ST3131 Regression Analysis
 Choose either Option A or Option B
 Option A – Pass two courses as follows:
 - One course from DSA42xx courses (except DSA4288 / DSA4288M / 
DSA4288S)
    or DSE4211  / QF4211 Digital Currencies
    or DSE4212 / QF4212 Data Science in FinTech
 - One other course fr

We load the chs requirements json file and convert the nested structureinto multiple documents.

In [54]:
# to get course description
def course_desc(course, parent = None):
    # if course is a string, return it directly
    if isinstance(course, str):
        return course
    # code of course
    code = course.get("code", "N/A")
    # title of course
    title = course.get("title", "N/A")
    # any footnotes
    note = course.get("note", "") or course.get("footnote") or parent.get("note") if parent else None
    prefix = ""
    # if parent has category, add it to prefix
    if parent and "category" in parent:
        # add the category to the prefix
        prefix = f"[{parent['category']}] - "
    if parent and parent.get("gen-coded") is False:
        prefix = f"{prefix}Non-GEN option - "
    if parent and parent.get("gen-coded") is True:
        prefix = f"{prefix}GEN option - "
    text = f"{prefix}{code}"
    if title:
        text += f": {title}"
    if note:
        text += f" ({note})"
    return text

# iterate through options: nested --> flat
"""
options examples:
{'category': 'Service-Learning C&E courses', 'subcategories': [{'semester': 1, 'courses': [{'code': 'GEN2050X', 'title': 'Teach SG'}, {'code': 'GEN2060X', 'title': 'Reconnect SeniorsSG'}, {'code': 'GEN2061X', 'title': 'Support Healthy AgeingSG'}, {'code': 'GEN2062X', 'title': 'Community Activities for Seniors with SG Cares'}, {'code': 'GEN2070X', 'title': 'Community Link (Comlink) Befrienders'}]}, {'semester': 2, 'courses': [{'code': 'GEN2050Y', 'title': 'Teach SG'}, {'code': 'GEN2060Y', 'title': 'Reconnect SeniorsSG'}, {'code': 'GEN2061Y', 'title': 'Support Healthy AgeingSG'}, {'code': 'GEN2062Y', 'title': 'Community Activities for Seniors with SG Cares'}, {'code': 'GEN2070Y', 'title': 'Community Link (Comlink) Befrienders'}]}]}
{'code': 'HS1501'}
{'code': 'CS2109S', 'footnote': '#'}
"""
def iter_options(options):
    for option in options:
        # if option is a string
        if isinstance(option, str):
            # yield: lets a function hand back one value at a time and pause between results so you can iterate lazily without building the entire list up front 
            yield option
        # if option has code
        elif "code" in option:
            yield course_desc(option)
        # if option has options
        elif "courses" in option:
            for course in option["courses"]:
                yield course_desc(course, parent=option)
        # if option has subcategories
        elif "subcategories" in option:
            for sub in option["subcategories"]:
                for course in sub.get("courses", []):
                    yield course_desc(course, parent=sub)
        else: 
            yield str(option)

In [55]:
def chs_summary():
    data = json.loads(chs_requirements.read_text())

    # return the cohort of chs_summary
    cohort = data.get("cohort", "none")

    # return the structure of chs_common_curriculum
    structure = data.get("chs_common_curriculum", {}).get("structure")

    # summary of the number of courses in each category
    structure_summary = (
        f"Total CHS common curriculum courses: {structure.get('total_courses', 'N/A')} "
        f"(common core: {structure.get('common_core_courses', 'N/A')}, "
        f"integrated {structure.get('integrated_courses', 'n/a')}, "
        f"interdisciplinary {structure.get('interdisciplinary_courses', 'n/a')})."
    )

    # start the document list, first one is the summary of all the chs requirements
    docs = [
        Document(
            page_content = f"College of Humanities and Sciences common curriculum summary "
            f"for {cohort}. {structure_summary}",

            metadata={
                "source": str(chs_requirements.relative_to(root)),
                "section": "summary",
                "category": "chs_common_curriculum",
            },
        )
    ]

    # get the chs common curriculum details
    curriculum = data.get("chs_common_curriculum", {})

    # docs for each pillar
    # section: what type of course: common core, integrated, interdisciplinary
    # entries: courses in that section
    def pillar_docs(section, entries):
        for entry in entries:
            # pillar example: {'pillar': 'Design Thinking', 'course_options': [{'code': 'DTK1234'}]}
            pillar = entry.get("pillar")
            options = entry.get("course_options", [])
            lines = [f"{course_desc(opt)}" for opt in iter_options(options)]
            text = (
                f"{section} pillar: {pillar}. "
                f"Students may choose from the following options:\n"
                + "\n".join(lines)
            )
            docs.append(
                Document(
                    page_content=text,
                    metadata={
                        "source": str(chs_requirements.relative_to(root)),
                        "section": section,
                        "pillar": pillar,
                        "category": "chs_common_curriculum"
                    },
                )
            )

    pillar_docs("Common core", curriculum.get("common_core", []) or [])
    pillar_docs("Integrated courses", curriculum.get("integrated_courses", []) or [])
    pillar_docs("Interdisciplinary courses", curriculum.get("interdisciplinary_courses", []) or [])

    # get the years to take some courses, e.g.: GEN
    timing = curriculum.get("timing_guidance", {})

    if timing:
        timing_lines = []
        for name, info in timing.items():
            # join the years list into a string
            years = ", ".join(str(y) for y in info.get("recommended_years", []))
            # note shows redundant info, so we omit it
            # Communities And Engagement: years 2, 3, 4.
            timing_lines.append(f"{name.replace('_', ' ').title()}: years {years}.")
            docs.append(
                Document(
                    page_content="Timing guidance for CHS curriculum:\n" + "\n".join(timing_lines),
                    metadata={
                        "source": str(chs_requirements.relative_to(root)),
                        "section": "timing_guidance",
                        "category": "chs_common_curriculum",
                    },
                )
            )

    # get year 1 preallocation info
    preallocation = curriculum.get("year1_preallocation", {})

    if preallocation:
        lines = []
        for cohort, info in preallocation.items():
            # sem 1 info
            s1 = ", ".join(info.get("semester_1", []))
            # sem 2 info
            s2 = ", ".join(info.get("semester_2", []))
            # append the line
            lines.append(
                f"{cohort.replace('_', ' ')}: Semester 1 -> {s1 or 'n/a'}; "
                f"Semester 2 -> {s2 or 'n/a'}."
            )
        docs.append(
            Document(
                page_content="Year 1 preallocation details:\n" + "\n".join(lines),
                metadata={
                    "source": str(chs_requirements.relative_to(root)),
                    "section": "preallocation",
                    "category": "chs_common_curriculum",
                },
            )
        )     
    return docs

chs_summary()

[Document(metadata={'source': 'data/dsa_chs_requirements.json', 'section': 'summary', 'category': 'chs_common_curriculum'}, page_content='College of Humanities and Sciences common curriculum summary for AY2021/22 onwards. Total CHS common curriculum courses: 13 (common core: 6, integrated 5, interdisciplinary 2).'),
 Document(metadata={'source': 'data/dsa_chs_requirements.json', 'section': 'Common core', 'pillar': 'Data Literacy', 'category': 'chs_common_curriculum'}, page_content='Common core pillar: Data Literacy. Students may choose from the following options:\nGEA1000: N/A\nDSA1101: N/A\nST1131: N/A\nDSE1101: N/A\nBT1101: N/A'),
 Document(metadata={'source': 'data/dsa_chs_requirements.json', 'section': 'Common core', 'pillar': 'Communities and Engagement', 'category': 'chs_common_curriculum'}, page_content='Common core pillar: Communities and Engagement. Students may choose from the following options:\nNon-GEN option - BN4102: Gerontechnology in Ageing\nNon-GEN option - BN4103: Ass

We create a function to get all the documents.

In [56]:
def load_curriculum_docs():
    docs = read_pdf(curriculum)
    docs.extend(chs_summary())
    return docs

load_curriculum_docs()

[Document(metadata={'source': 'data/dsa_degree_requirements.pdf', 'page': 0, 'category': 'curriculum'}, page_content='Requirements for Major in Data Science and Analytics\nApplicable to cohorts AY2021/2022 and after\nLevels Major Requirements Cumulative \nMajor Units\nPass\n-  DSA1101 Introduction to Data Science1\nPass\n-  CS2040 Data Structures and Algorithms2\n-  DSA2101 Essential Data Analytics Tools: Data Visualisation\n-  DSA2102 Essential Data Analytics Tools: Numerical Computation\n-  MA2001 Linear Algebra I\n-  MA2002 Calculus\n-  MA2311 Techniques in Advanced Calculus\n or MA2104 Multivariable Calculus\n-  ST2131/MA2116/MA2216 Probability\n-  ST2132 Mathematical Statistics\nPass\n-  CS3244 Machine Learning\n-  DSA3101 Data Science in Practice\n-  DSA3102 Essential Data Analytics Tools: Convex Optimisation\n-  ST3131 Regression Analysis\n Choose either Option A or Option B\n Option A – Pass two courses as follows:\n - One course from DSA42xx courses (except DSA4288 / DSA4288M 

Function to get all valid mod code from docs.

In [57]:

# regex: \b: word boundary, [A-Z]{2,4}: 2-4 uppercase letters, \d{2,4}: 2-4 digits, \d{2,4}(?:[A-Z]{1,3}|X{1,3}|[a-z]{1,3})?\b: 1-3 uppercase letters, 1-3 lowercase letters, or 1-3 Xs
module_code = re.compile(r"\b[A-Z]{2,4}\d{2,4}(?:[A-Z]{1,3}|X{1,3}|[a-z]{1,3})?\b")

def valid_mods():
    docs = load_curriculum_docs()
    module_codes = {
        code.upper()
        for doc in docs
        for code in module_code.findall(doc.page_content)
    }

    with chs_requirements.open("r") as fh:
        # load the json data from chs
        chs_data = json.load(fh)

        # go through everything recursively to find all module codes
        def collect_codes(node):
            if isinstance(node, dict):
                if "code" in node:
                    module_codes.add(node["code"].upper())
                for value in node.values():
                    collect_codes(value)
            elif isinstance(node, list):
                for item in node:
                    collect_codes(item)

        collect_codes(chs_data)

        # sort and filter module codes with length >= 5
        filtered = sorted({code for code in module_codes if len(code) >= 5})

        content = (
            "Modules recognised by the planning assistant (from core DSA "
            "requirements, CHS common curriculum, and specialisation options):\n"
            + "\n".join(filtered)
        )

    return [
        Document (
            page_content=content,
            metadata={
                "source": "generated_valid_modules",
                "category": "valid_modules",
            },
        )
    ]
    
valid_mods()

[Document(metadata={'source': 'generated_valid_modules', 'category': 'valid_modules'}, page_content='Modules recognised by the planning assistant (from core DSA requirements, CHS common curriculum, and specialisation options):\nAY2021\nBN4102\nBN4103\nBT1101\nBZ2001\nCDE2001\nCLC1101\nCLC2204\nCLC3303\nCLC3304A\nCLC3307\nCM3267\nCOS1000\nCS1010S\nCS1010X\nCS1010X / CS1101S\nCS1101S\nCS2040\nCS2109S\nCS3244\nDSA1101\nDSA2101\nDSA2102\nDSA3101\nDSA3102\nDSA3288\nDSA3288R\nDSA4212\nDSA426X\nDSA4288\nDSA4288M\nDSA4288S\nDSA4288X\nDSA42XX\nDSE1101\nDSE4211\nDSE4212\nDTK1234\nFAS1101\nGEA1000\nGEI1001\nGEI1002\nGEN2000\nGEN2001\nGEN2002X\nGEN2002Y\nGEN2003\nGEN2004\nGEN2005\nGEN2006\nGEN2007\nGEN2008\nGEN2009\nGEN2050X\nGEN2050Y\nGEN2060X\nGEN2060Y\nGEN2061X\nGEN2061Y\nGEN2062X\nGEN2062Y\nGEN2070X\nGEN2070Y\nHS1501\nHS1502\nHS1503\nHS29XX\nHSA1000\nHSH1000\nHSI1000\nHSI20XX\nHSS1000\nIT1244\nLSM2302\nLSM2302 / BZ2001\nMA2001\nMA2002\nMA2104\nMA2116\nMA2216\nMA2311\nMA3227\nMA3238\nMA3252\nMA

### API Schema Document

Create a helper that returns the NUSMods module schema guide.

In [58]:
def load_api_schema_docs():
  # this just gives the info needed to understand the api to help the model
    schema_text = """NUSMods module API schema essentials:

Endpoint: https://api.nusmods.com/v2/<acadYear>/modules/<moduleCode>.json

Fields:
- moduleCode, title, description, moduleCredit, faculty, department
- prerequisite, preclusion, corequisite, prerequisiteTree, fulfillRequirements
- semesterData: per-semester offerings with timetable + exam info

SemesterData entry includes:
- semester (int) and optional examDate (ISO string)
- timetable blocks (classNo, lessonType, day, startTime, endTime, weeks, venue)

Interpretation tips:
- Use semesterData to see when a module runs and lesson types offered
- Combine prerequisite + prerequisiteTree for eligibility checks
- fulfillRequirements shows downstream modules unlocked after passing
- Empty semesterData means the module is not offered that academic year

Example response for module DSA4213:
{
  "moduleCode": "DSA4213",
  "moduleCredit": "4",
  "semesterData": [
    {
      "semester": 1,
      "timetable": [
        {
          "classNo": "1",
          "lessonType": "Lecture",
          "day": "Wednesday",
          "startTime": "1000",
          "endTime": "1200",
          "weeks": [1,2,3,4,5,7,8,9,10,11,12,13],
          "venue": "COM3-01-01"
        }
      ]
    }
  ]
}
"""
    return [
        Document(
            page_content=schema_text,
            metadata={
                "source": "nusmods_api_schema",
                "category": "api_schema",
            },
        )
    ]

load_api_schema_docs()[0]


Document(metadata={'source': 'nusmods_api_schema', 'category': 'api_schema'}, page_content='NUSMods module API schema essentials:\n\nEndpoint: https://api.nusmods.com/v2/<acadYear>/modules/<moduleCode>.json\n\nFields:\n- moduleCode, title, description, moduleCredit, faculty, department\n- prerequisite, preclusion, corequisite, prerequisiteTree, fulfillRequirements\n- semesterData: per-semester offerings with timetable + exam info\n\nSemesterData entry includes:\n- semester (int) and optional examDate (ISO string)\n- timetable blocks (classNo, lessonType, day, startTime, endTime, weeks, venue)\n\nInterpretation tips:\n- Use semesterData to see when a module runs and lesson types offered\n- Combine prerequisite + prerequisiteTree for eligibility checks\n- fulfillRequirements shows downstream modules unlocked after passing\n- Empty semesterData means the module is not offered that academic year\n\nExample response for module DSA4213:\n{\n  "moduleCode": "DSA4213",\n  "moduleCredit": "4",\

### Build Datasets

Group curriculum, valid-module, and schema documents so the retrieval layer can access them.

In [61]:
from dataclasses import dataclass

@dataclass
class ContextDataset:
    name: str
    documents: List[Document]
    # method to split documents into chunks with overlap
    def split(self, chunk_size: int = 800, chunk_overlap: int = 150) -> List[Document]:
    # raising errors
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        
        # split each document into chunks
        chunks = []
        # for all documents
        for doc in self.documents:
            text = doc.page_content
            start = 0
            while start < len(text):
                # get ending index of chunk
                end = min(len(text), start + chunk_size)
                # get the text in that range from start to end
                chunk_text = text[start:end].strip()
                # if chunk has text
                if chunk_text:
                    # create metadata for the chunk
                    metadata = dict(doc.metadata)
                    # add in additional metadata about the chunk
                    metadata.update({
                        "chunk_start": start,
                        "chunk_end": end,
                        "dataset": self.name,
                    })
                    # append to the chunks as a document
                    chunks.append(Document(page_content=chunk_text, metadata=metadata))
                # stop when reach end of text
                if end >= len(text):
                    break
                # move start to end minus overlap
                start = max(0, end - chunk_overlap)
        return chunks

# start to build dataset
def build_datasets():
    # get all the docs --> using the functions
    curriculum_docs = ContextDataset("curriculum", load_curriculum_docs())
    valid_docs = ContextDataset("valid_modules", valid_mods())
    schema_docs = ContextDataset("api_schema", load_api_schema_docs())
    # return full data
    return {
        curriculum_docs.name: curriculum_docs,
        valid_docs.name: valid_docs,
        schema_docs.name: schema_docs,
    }

datasets = build_datasets()
datasets.keys()
datasets.values()


dict_values([ContextDataset(name='curriculum', documents=[Document(metadata={'source': 'data/dsa_degree_requirements.pdf', 'page': 0, 'category': 'curriculum'}, page_content='Requirements for Major in Data Science and Analytics\nApplicable to cohorts AY2021/2022 and after\nLevels Major Requirements Cumulative \nMajor Units\nPass\n-  DSA1101 Introduction to Data Science1\nPass\n-  CS2040 Data Structures and Algorithms2\n-  DSA2101 Essential Data Analytics Tools: Data Visualisation\n-  DSA2102 Essential Data Analytics Tools: Numerical Computation\n-  MA2001 Linear Algebra I\n-  MA2002 Calculus\n-  MA2311 Techniques in Advanced Calculus\n or MA2104 Multivariable Calculus\n-  ST2131/MA2116/MA2216 Probability\n-  ST2132 Mathematical Statistics\nPass\n-  CS3244 Machine Learning\n-  DSA3101 Data Science in Practice\n-  DSA3102 Essential Data Analytics Tools: Convex Optimisation\n-  ST3131 Regression Analysis\n Choose either Option A or Option B\n Option A – Pass two courses as follows:\n - On

We iterate over the datasets, split into chunks and feed into FAISS.from_documents.

Output: dictionary mapping each dataset name to FAISS vector index --> similarity search against corpus with consistent embedding.

In [65]:
# make sure to pull mxbai-embed-large model
# docker pull mxbai/ollama
# ollama serve 

embedding = OllamaEmbeddings(model="mxbai-embed-large")
# we use FAISS to be consistent with retrieval tool
def build_indexes():
    indexes = {}
    # iterate over datasets
    for name, dataset in datasets.items():
        # skip valid_modules
        if name == "valid_modules":
            continue
        # split dataset into chunks
        splits = dataset.split()
        # feed into FAISS
        store = FAISS.from_documents(splits, embedding)
        # map dataset name to FAISS store
        indexes[name] = store
    return indexes

indexs = build_indexes()
list(indexs.keys())


['curriculum', 'api_schema']

### Helper Queries

Wrappers to fetch curriculum snippets, valid module lists, and schema notes.

In [67]:
# helper to extract scores and enrich docs with similarity score
def scoring(results):
    docs = []
    scores = []
    # for each result
    for doc, score in results:
        # create a new document with the same content and metadata
        enriched = Document(page_content=doc.page_content, metadata=dict(doc.metadata))
        # add the similarity score to the metadata
        enriched.metadata["similarity"] = score
        # add to the docs list
        docs.append(enriched)
        # append the score
        scores.append(score)
    return scores, docs

# search through curriculum 
def search_curriculum(query, top_k):
    # get the scores (similarity_search_with_score: comes from FAISS vector score)
    scores, docs = scoring(
        indexs["curriculum"].similarity_search_with_score(query, k = top_k)
    )
    return scores, docs

# list valid modules
def list_valid_modules(query = None, limit= 50):
    # get the valid modules from dataset
    doc = datasets["valid_modules"].documents[0]
    # get lines from the document
    lines = [line.strip() for line in doc.page_content.splitlines() if line.strip()]
    # separate description and modules
    description, *modules = lines
    # if query exists
    if query:
        # make query uppercase
        query_upper = query.upper()
        # filter modules that contain the query
        modules = [code for code in modules if query_upper in code.upper()]
    else:
        # no query, return all modules
        modules = modules[:limit]
    return description, modules

# explain schema
def explain_schema(query, top_k):
    # get the scores (similarity_search_with_score: comes from FAISS vector score)
    _, docs = scoring(
        indexs["api_schema"].similarity_search_with_score(query, k = top_k)
    )
    return docs

search_curriculum("convex optimisation", top_k = 2)[0], list_valid_modules("DSA4")[1][:5], explain_schema("semesterData", 1)[0].metadata


([266.8778, 289.2295],
 ['DSA4212', 'DSA426X', 'DSA4288', 'DSA4288M', 'DSA4288S'],
 {'source': 'nusmods_api_schema',
  'category': 'api_schema',
  'chunk_start': 0,
  'chunk_end': 800,
  'dataset': 'api_schema',
  'similarity': 166.46776})