We first load in the PDF documents and chunk them.

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

pdf_folder = "data"

pdf_docs = []

for filename in os.listdir(pdf_folder):
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()

        # Optional: tag documents with source info
        for doc in docs:
            doc.metadata["source"] = filename

        pdf_docs.extend(docs)

print(f"Loaded {len(pdf_docs)} pages from {len(os.listdir(pdf_folder))} PDF files.")

# Split documents into smaller chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # size per chunk in characters
    chunk_overlap=200, # overlap between consecutive chunks
    length_function=len,
    separators=["\n\n", "\n", ".", " "]
)

pdf_chunks = splitter.split_documents(pdf_docs)
print(f"Total chunks created: {len(pdf_chunks)}")

Loaded 5 pages from 2 PDF files.
Total chunks created: 12


Chunk the JSON CHS Requirements.

In [4]:
import json

json_path = "data/dsa_chs_requirements.json"

with open(json_path, "r", encoding="utf-8") as f:
    module_data = json.load(f)

json_docs = []

chs_curr = module_data['chs_common_curriculum']

common_core = chs_curr['common_core']
integrated = chs_curr['integrated_courses']
interdisciplinary = chs_curr['interdisciplinary_courses']
year1_preallocation = chs_curr['year1_preallocation']


For common core modules, the format is more complicated. It contains a list of common core pillars with varying depths of subdictionaries.
- 6 Pillars: Data Literacy, Communities and Engagement, Artificial Intelligence, Design Thinking, Digital Literacy, Writing.
- All contains 2 keys: 'pillar' and 'course_options' but communities and engagement contains deeper subdictionaries with different categories of C&E modules. 

For integrated and interdisciplinary modules, each sub-curriculum is a list where each element is a dictionary with keys 'pillar' and 'course_options'.

For year 1 preallocated modules, it contains a dictionary with 2 keys, each representing the group of students of which the following modules are preallocated for.

In [16]:
cc_data = common_core[0] # Data Literacy Pillar
cc_ce = common_core[1] # Communities and Engagement Pillar
cc_ai = common_core[2] # Artificial Intelligence Pillar
cc_dt = common_core[3] # Design Thinking Pillar
cc_digi = common_core[4] # Digital Literacy Pillar
cc_writing = common_core[5] # Writing Pillar

cc_ce['course_options'][0]


{'category': 'Project-based Engagement-Learning C&E courses',
 'subcategories': [{'gen-coded': False,
   'courses': [{'code': 'BN4102', 'title': 'Gerontechnology in Ageing'},
    {'code': 'BN4103',
     'title': 'Assistive Technology for Persons with Disability'},
    {'code': 'CDE2001',
     'title': 'Innovation and Design for Communities',
     'note': 'Only offered to CDE students.'},
    {'code': 'CLC1101', 'title': 'Engaging and Building Communities'},
    {'code': 'CLC2204', 'title': 'Community Development with Youth'},
    {'code': 'CLC3303', 'title': 'Community Leadership'},
    {'code': 'CLC3304A', 'title': 'City, Culture and Community'},
    {'code': 'CLC3307', 'title': 'Learning for Social Change'}]},
  {'gen-coded': True,
   'courses': [{'code': 'GEN2000',
     'title': 'Living Culture: Engaging Indian Communities in Singapore'},
    {'code': 'GEN2001', 'title': 'Theatre and Community Engagement'},
    {'code': 'GEN2002X',
     'title': 'Total Defence Project Against Threat

In [17]:
from langchain_core.documents import Document

chs_curr_json = []

for pillars in common_core:
    pillar_name = pillars['pillar']
    if pillar_name == 'Communities and Engagement':
        course_options = pillars['course_options']

        for categories in course_options:
            category_name = categories['category']
            subcategories = categories['subcategories']

            for sub in subcategories:
                if 'gen-coded' in sub:
                    gen_coded = sub.get('gen-coded')
                    module_info = sub['courses']

                    modules_text = []
                    
                    for mod in module_info:
                        modules_text.append(f"{mod['code']} ({mod['title']})")
                    
                    content = (
                        f"Group: CHS Common Core Modules\n"
                        f"Pillar Name: {pillar_name}\n"
                        f"Category: {category_name}\n"
                        f"Gen-Coded: {gen_coded}\n"
                        f"Modules: {', '.join(modules_text)}"
                    )
                
                elif 'semester' in sub:
                    sem = sub.get('semester')
                    module_info = sub['courses']

                    modules_text = []
                    
                    for mod in module_info:
                        modules_text.append(f"{mod['code']} ({mod['title']})")
                    
                    content = (
                        f"Group: CHS Common Core Modules\n"
                        f"Pillar Name: {pillar_name}\n"
                        f"Category: {category_name}\n"
                        f"Semester: {sem}\n"
                        f"Modules: {', '.join(modules_text)}"
                    )

                chs_curr_json.append(
                    Document(
                        page_content=content,
                        metadata = {"group": "CHS Common Core Modules", "pillar": pillar_name, "category": category_name}
                    )
                )
    else:
        course_options = pillars['course_options']

        modules_text = []

        for mod in course_options:
            if 'footnote' in mod:
                modules_text.append(f"{mod['code']} ({mod['footnote']})")
            
            else:
                modules_text.append(mod['code'])

        content = (
            f"Group: CHS Common Core Modules\n"
            f"Pillar: {pillar_name}\n"
            f"Modules: {', '.join(modules_text)}"
        )

        chs_curr_json.append(
                Document(
                    page_content=content,
                    metadata={"group": "CHS Common Core Modules", "pillar": pillar_name}
                )
            )

for pillars in integrated:
    pillar_name = pillars['pillar']
    course_options = pillars['course_options']
    module_codes = [mod['code'] for mod in course_options]

    content = (
            f"Group: CHS Integrated Modules\n"
            f"Pillar: {pillar_name}\n"
            f"Modules: {', '.join(module_codes)}"
        )
    
    chs_curr_json.append(
        Document(page_content=content, metadata = {"group": "CHS Integrated Modules", "pillar": pillar_name})
    )

for pillars in interdisciplinary:
    pillar_name = pillars['pillar']
    course_options = pillars['course_options']
    module_codes = [mod['code'] for mod in course_options]

    content = (
            f"Group: CHS Interdisciplinary Modules\n"
            f"Pillar: {pillar_name}\n"
            f"Modules: {', '.join(module_codes)}"
        )
    
    chs_curr_json.append(
        Document(page_content=content, metadata = {"group": "CHS Interdisciplinary Modules", "pillar": pillar_name})
    )
    
for student_group, preallocation in year1_preallocation.items():
    sem1_modules = preallocation['semester_1']
    sem2_modules = preallocation['semester_2']

    student = "Student ID ending with odd number" if student_group == "student_id_ending_odd" else "Student ID ending with even number"

    content = (
        f"CHS Allocated Student Group: {student}\n"
        f"Semester 1 Modules: {', '.join(sem1_modules)}\n"
        f"Semester 2 Modules {', '.join(sem2_modules)}"
    )

    chs_curr_json.append(
        Document(page_content=content, metadata = {"group": "CHS Year 1 Preallocated Modules"})
    )


In [18]:
chs_curr_json

[Document(metadata={'group': 'CHS Common Core Modules', 'pillar': 'Data Literacy'}, page_content='Group: CHS Common Core Modules\nPillar: Data Literacy\nModules: GEA1000, DSA1101, ST1131, DSE1101 (DSE1101 can only be read by students in the Data Science and Economics programme.), BT1101'),
 Document(metadata={'group': 'CHS Common Core Modules', 'pillar': 'Communities and Engagement', 'category': 'Project-based Engagement-Learning C&E courses'}, page_content='Group: CHS Common Core Modules\nPillar Name: Communities and Engagement\nCategory: Project-based Engagement-Learning C&E courses\nGen-Coded: False\nModules: BN4102 (Gerontechnology in Ageing), BN4103 (Assistive Technology for Persons with Disability), CDE2001 (Innovation and Design for Communities), CLC1101 (Engaging and Building Communities), CLC2204 (Community Development with Youth), CLC3303 (Community Leadership), CLC3304A (City, Culture and Community), CLC3307 (Learning for Social Change)'),
 Document(metadata={'group': 'CHS C

Combine all information.

In [19]:
all_info = pdf_chunks + chs_curr_json

Create an embedding store using OllamaEmbeddings and FAISS.

In [20]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

embedding = OllamaEmbeddings(model="mxbai-embed-large")  # or 'nomic-embed-text'
vectorstore = FAISS.from_documents(all_info, embedding)
vectorstore.save_local("curriculum_info_vectors")

test drive

In [1]:
from langchain_core.messages import HumanMessage, AIMessage
from langgraph.graph import StateGraph, START, END
from langgraph.graph import START, MessagesState, StateGraph
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings

embedding = OllamaEmbeddings(model="mxbai-embed-large") 

vectorstore = FAISS.load_local("curriculum_info_vectors", embedding, allow_dangerous_deserialization=True)  # pass the same embedding instance
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

class ModState(MessagesState):
    messages: any = None
    retrieved_docs: any = None


llm = ChatOllama(
    model="qwen3:14b",
    temperature=0.2,
    num_predict=-1,
    reasoning=True,
    validate_model_on_init=True,
)


def retrieve_node(state):
    print("üîπ Running retrieve_node")
    query = state["messages"][-1].content  # last user message
    docs = retriever.invoke(query)
    print(f"Data type of docs: {type(docs)}")
    state["retrieved_docs"] = docs
    state["messages"].append(AIMessage(content=f"Retrieved {len(docs)} document(s)."))
    print(f"Retrieved {len(docs)} document(s).")
    return state


def llm_node(state):
    print("üîπ Running llm_node")
    print(state)
    context = "\n\n".join([d.page_content for d in state["retrieved_docs"]])
    query = state["messages"][0].content

    prompt = f"""You are a university module planning assistant.
    Use the provided context below to answer questions accurately about modules, pillars, or degree requirements.

    Context:
    {context}

    Question:
    {query}
    """

    response = llm.invoke([HumanMessage(content=prompt)])
    print(f"Response: {response}")

    state['messages'].append(AIMessage(content=response.content))
    return state

graph = StateGraph(ModState)

# Add nodes
graph.add_node("retrieve", retrieve_node)
graph.add_node("llm", llm_node)

# 3Ô∏è‚É£ Connect edges (define flow)
graph.add_edge(START, "retrieve")
graph.add_edge("retrieve", "llm")
graph.add_edge("llm", END)

# 4Ô∏è‚É£ Compile graph
app = graph.compile()

# 5Ô∏è‚É£ Test run
state = {
    "messages": [HumanMessage(content="What are the modules in common core communities and engagement pillar?")],
    "retrieved_docs": None
}

result = app.invoke(state)

# 6Ô∏è‚É£ Print results
for msg in result["messages"]:
    print(f"{msg.type.upper()}: {msg.content}")

  embedding = OllamaEmbeddings(model="mxbai-embed-large")
  llm = ChatOllama(


üîπ Running retrieve_node
Data type of docs: <class 'list'>
Retrieved 4 document(s).
üîπ Running llm_node
{'messages': [HumanMessage(content='What are the modules in common core communities and engagement pillar?', additional_kwargs={}, response_metadata={}), AIMessage(content='Retrieved 4 document(s).', additional_kwargs={}, response_metadata={})], 'retrieved_docs': [Document(id='1e577e70-b363-41ad-9df6-cf2b913473ee', metadata={'group': 'Common Core Modules', 'pillar': 'Communities and Engagement', 'category': 'Service-Learning C&E courses'}, page_content='Group: Common Core Modules\nPillar Name: Communities and Engagement\nCategory: Service-Learning C&E courses\nGen-Coded:  \nModules: GEN2050Y (Teach SG), GEN2060Y (Reconnect SeniorsSG), GEN2061Y (Support Healthy AgeingSG), GEN2062Y (Community Activities for Seniors with SG Cares), GEN2070Y (Community Link (Comlink) Befrienders)'), Document(id='4cc4ab6b-a9be-4941-88c8-8c105f266324', metadata={'group': 'Common Core Modules', 'pillar'