In [1]:
from PyPDF2 import PdfReader
import re
import os
import glob
from dotenv import load_dotenv
import numpy as np
import gradio as gr
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import plotly.graph_objects as go

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI

from openai import OpenAI


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Change dir to the parent of the folder the kernel started in.
# The starting directory is remembered in NOTEBOOK_DIR so that re-running this
# cell always lands in the same place instead of climbing one more level on
# every execution.
NOTEBOOK_DIR = globals().get("NOTEBOOK_DIR") or os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [3]:
# Load environment variables from .env (overriding any already set), then run
# a cheap shape check on the OpenAI key before any API call is attempted.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')

key_looks_valid = bool(api_key) and api_key.startswith('sk-proj-') and len(api_key) > 10
print(
    "API key looks good so far"
    if key_looks_valid
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)
    


API key looks good so far


In [4]:
# Constants shared by the rest of the notebook.
db_name = "budget_vector_db"  # directory passed to Chroma as persist_directory
MODEL = 'gpt-4o-mini'  # chat model name handed to ChatOpenAI further down
openai = OpenAI()  # NOTE(review): no later cell in this notebook appears to use this client

In [None]:
# Config for the single-document walkthrough below.
# The path is assembled with os.path.join so it resolves on every OS (a
# hard-coded backslash path only works on Windows), and the source name is
# derived from it so the two cannot drift apart.
pdf_input = os.path.join('knowledge_base', '2025', 'fy2025_budget_statement.pdf')
source_name = os.path.basename(pdf_input)
doc_type = "budget_statement"


## Section extraction from the table of contents

In [5]:
# extract table of contents
def extract_sections_from_toc(pdf_path, toc_pages=(0, 1)):
    """
    Parse section titles and page numbers from the table-of-contents pages of a PDF.

    A line is treated as a ToC entry when it looks like ``A. Some title ..... 12``:
    one capital letter and a dot, the title, a run of at least three dots, then
    the page number at the end of the line.

    Parameters:
      pdf_path (str): Path to the PDF file.
      toc_pages (tuple): Inclusive (first, last) zero-based page indices to scan.

    Returns:
      List of tuple: (raw_title, page_num) in the order found. The title still
      carries its leading letter prefix (e.g. "A. Introduction").
    """
    reader = PdfReader(pdf_path)
    toc_line = re.compile(r'^([A-Z]\.\s.+?)\.{3,}\s+(\d+)$')

    # Clamp to the pages that actually exist so a PDF shorter than the
    # requested ToC range does not raise IndexError.
    last_page = min(toc_pages[1], len(reader.pages) - 1)

    sections = []
    for i in range(toc_pages[0], last_page + 1):
        # A page without a text layer may yield nothing; treat it as empty
        # (extract_page_texts applies the same kind of guard).
        text = reader.pages[i].extract_text() or ""
        for line in text.split('\n'):
            match = toc_line.match(line.strip())
            if match:
                title = match.group(1).strip()
                page_num = int(match.group(2))
                sections.append((title, page_num))
    return sections

# cleaning
def clean_section_titles(raw_sections):
    """
    Normalise the titles produced by the ToC parser.

    For every (title, page_num) pair the leading "A. " style prefix is dropped,
    runs of two or more dots are removed, repeated whitespace is collapsed to a
    single space and the result is trimmed. Page numbers pass through untouched.

    Returns:
      List of tuple: (clean_title, page_num) in the input order.
    """
    def _tidy(title):
        without_prefix = re.sub(r'^[A-Z]\.\s+', '', title)
        without_dots = re.sub(r'\.{2,}', '', without_prefix)
        return re.sub(r'\s{2,}', ' ', without_dots).strip()

    return [(_tidy(raw_title), page_num) for raw_title, page_num in raw_sections]


In [None]:
# Parse the ToC of the configured PDF, then tidy the titles
# (letter prefix, dot runs and repeated spaces are removed).
raw_sections = extract_sections_from_toc(pdf_input)
sections = clean_section_titles(raw_sections)
print(sections)

## Text extraction and cleaning

In [6]:
def extract_page_texts(pdf_path, start_page=0, page_offset = 0):
    """
    Read the raw text of every page from index `start_page` to the end of the PDF.

    Pages whose extracted text is empty are left out. The reported page number
    is the zero-based page index plus `page_offset`.

    Returns:
      List of dict: [{"page_num": int, "text": str}, ...] in document order.
    """
    reader = PdfReader(pdf_path)
    indexed_texts = (
        (index, reader.pages[index].extract_text())
        for index in range(start_page, len(reader.pages))
    )
    return [
        {"page_num": index + page_offset, "text": text}
        for index, text in indexed_texts
        if text
    ]


def clean_text(text):
    """
    Drop footer lines such as 'Page 3 of 86' from a page's text.

    Any line containing a "Page <x> of <y>" marker (case-insensitive) is removed
    entirely; the remaining lines are re-joined and the result is stripped of
    leading/trailing whitespace.
    """
    footer = re.compile(r'\bPage\s*\d+\s+of\s+\d+\b', re.IGNORECASE)
    kept_lines = [line for line in text.split('\n') if not footer.search(line)]
    return '\n'.join(kept_lines).strip()


In [None]:
# Start at page index 2 to skip the ToC pages; page_offset=1 shifts the
# zero-based index so the stored page_num is index + 1.
raw_pages = extract_page_texts(pdf_input, start_page=2, page_offset=1)  # Skip ToC

# Apply cleaning: same page numbers, text with "Page x of y" footer lines removed
cleaned_pages = []
for page in raw_pages:
    cleaned_pages.append({
        "page_num": page["page_num"],
        "text": clean_text(page["text"])
    })


In [None]:
# Peek at the first two cleaned pages as a sanity check.
cleaned_pages[:2]

## Build the metadata

In [7]:
def build_metadata_tuples(sections, cleaned_pages, offset=0, source=None, doc_type=None):
    """
    Tag every cleaned page with the section it falls in, plus optional source/doc_type.

    Each section spans from its (offset-adjusted) start page up to the page
    before the next section starts; the last section runs to the highest page
    number present in `cleaned_pages`. Pages before the first section, or any
    page when `sections` is empty, are labelled "Not Assigned".

    Parameters:
      sections (list of tuple): (section_name, start_page), e.g. [('Introduction', 3), ...].
                                Need not be sorted.
      cleaned_pages (list of dict): Each with keys 'page_num' and 'text'.
      offset (int): Subtracted from every section start page, for when the page
                    numbering of `cleaned_pages` is shifted relative to the ToC.
      source (str, optional): The base file name or identifier for the document.
      doc_type (str, optional): The document type (e.g., "budget_statement").

    Returns:
      List of dict: A copy of each page dict with 'section' added, and 'source' /
                    'doc_type' added only when they were provided. The input
                    dicts are not modified. Empty when `cleaned_pages` is empty.
    """
    # Nothing to annotate; this also avoids max() raising ValueError below.
    if not cleaned_pages:
        return []

    # Shift the section starts by the offset and order them by page.
    sections_sorted = sorted(
        ((name, start_page - offset) for name, start_page in sections),
        key=lambda item: item[1],
    )
    max_page = max(page['page_num'] for page in cleaned_pages)

    # Every section ends one page before the next one starts; the last ends at max_page.
    end_pages = [next_start - 1 for _, next_start in sections_sorted[1:]] + [max_page]
    section_ranges = [
        (name, start_page, end_page)
        for (name, start_page), end_page in zip(sections_sorted, end_pages)
    ]

    metadata_pages = []
    for page in cleaned_pages:
        page_num = page['page_num']
        # First matching range wins, mirroring a top-to-bottom scan of the sorted sections.
        assigned_section = next(
            (name for name, start_page, end_page in section_ranges
             if start_page <= page_num <= end_page),
            "Not Assigned",
        )
        page_with_metadata = page.copy()
        page_with_metadata['section'] = assigned_section
        if source is not None:
            page_with_metadata['source'] = source
        if doc_type is not None:
            page_with_metadata['doc_type'] = doc_type
        metadata_pages.append(page_with_metadata)

    return metadata_pages


In [None]:
# Attach section / source / doc_type to every cleaned page.
# offset=0 means the section start pages are used exactly as parsed from the ToC.
metadata_pages = build_metadata_tuples(sections, cleaned_pages, offset=0, source=source_name, doc_type=doc_type)


In [None]:
# Inspect the first three annotated pages.
metadata_pages[:3]

## Chunk

In [8]:
def langchain_chunk_texts(metadata_pages, chunk_size=1000, chunk_overlap=200):
    """
    Split each page's text into overlapping chunks with LangChain's
    RecursiveCharacterTextSplitter, carrying the page's metadata onto every chunk.

    Parameters:
      metadata_pages (list): Dicts with keys 'page_num', 'text', 'section',
                             'source' and 'doc_type'.
      chunk_size (int): Chunk size handed to the splitter.
      chunk_overlap (int): Overlap between consecutive chunks handed to the splitter.

    Returns:
      List of dict: One dict per chunk with keys 'content', 'section',
                    'page_nums', 'source' and 'doc_type', in page order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " "],  # tried in this order
    )

    return [
        {
            "content": piece,
            "section": page['section'],
            "page_nums": page['page_num'],
            "source": page["source"],
            "doc_type": page["doc_type"],
        }
        for page in metadata_pages
        for piece in splitter.split_text(page['text'])
    ]

In [None]:
# Chunk every annotated page, then show the first chunk dict as a sanity check.
chunks_temp = langchain_chunk_texts(metadata_pages, chunk_size=1000, chunk_overlap=200)
chunks_temp[0]

In [None]:
# Number of chunks produced for this document.
len(chunks_temp)

In [9]:
def convert_chunks_to_documents(chunks):
    """
    Reshape flat chunk dicts into {'context': ..., 'metadata': {...}} records.

    Parameters:
      chunks (list of dict): Each dict has keys "content", "section",
                             "page_nums", "source" and "doc_type".

    Returns:
      List of dict: 'context' holds the chunk text; 'metadata' holds 'section',
                    'page_num' (taken from "page_nums"), 'source' and 'doc_type'.
    """
    return [
        {
            "context": chunk["content"],
            "metadata": {
                "section": chunk["section"],
                "page_num": chunk["page_nums"],
                "source": chunk["source"],
                "doc_type": chunk["doc_type"],
            },
        }
        for chunk in chunks
    ]




In [None]:
# Reshape the flat chunk dicts into context + metadata records and show the first one.
chunks = convert_chunks_to_documents(chunks_temp)
chunks[0]

## Embeddings

In [None]:
# Convert to LangChain Document objects
from langchain_core.documents import Document

# One Document per chunk: the text becomes page_content, the rest becomes metadata.
docs = [
    Document(
        page_content=chunk["context"],
        metadata=chunk["metadata"]
    )
    for chunk in chunks
]

# Initialize embeddings
embeddings = OpenAIEmbeddings()  # or HuggingFaceEmbeddings(...) if you prefer free

# Delete previous DB if exists
# (opens the persisted store at db_name only to drop its collection —
# presumably so the rebuild below does not add to stale entries)
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create and persist vectorstore
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=db_name
)

# NOTE(review): _collection is a private attribute of the vector store wrapper.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")


## Visualize

In [None]:
# Get one vector and find how many dimensions it has

# `collection` is reused by the visualisation cells that follow.
collection = vectorstore._collection
# Fetch a single stored record with its embedding and take that vector.
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

In [None]:
# Prework (with thanks to Jon R for identifying and fixing a bug in this!)

# Pull every stored vector together with its text and metadata.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# 'budget_statement' keeps its blue marker. Any other doc_type found in the
# store (e.g. the per-year 'budget_statement_2024' labels built later in this
# notebook) gets the next palette colour instead of raising ValueError.
palette = ['green', 'red', 'orange', 'purple', 'brown']
color_by_type = {'budget_statement': 'blue'}
for doc_type_label in sorted(set(doc_types) - set(color_by_type)):
    color_by_type[doc_type_label] = palette[(len(color_by_type) - 1) % len(palette)]
colors = [color_by_type[t] for t in doc_types]

In [None]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# Fixed random_state keeps the layout reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hover text shows the doc type and the first 100 characters.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles are set directly on the layout: `scene` only applies to 3D
# plots, so titles placed there never show up on a 2D scatter.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## Chat

In [None]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
# (output_key='answer' names which chain output is stored as the assistant turn)
memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})


In [None]:
# System instructions: answer only from the supplied budget documents and cite sections/years.
system_msg = (
    "You are a knowledgeable assistant that provides clear, concise insights on Singapore’s budgets. "
    "You may compare and reference details from any provided documents. If the user asks about details "
    "beyond these documents, explain that the information is not available. Always base your answers on the "
    "documents you have, and do not speculate or fabricate information. When making comparisons or referencing "
    "specific details, cite the relevant sections or years to support your explanation."
)

# Prompt with two template variables, {context} and {question}; it is passed
# to the chain through combine_docs_chain_kwargs in a later cell.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_msg),
    HumanMessagePromptTemplate.from_template(
        "Here are some documents to help you:\n\n{context}\n\nNow answer the question: {question}"
    )
])

In [None]:
# Wire the LLM, retriever and memory into a conversational RAG chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    # Reuse the k=25 retriever configured above; a fresh as_retriever() call
    # would ignore that setting and use the vector store's default instead.
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt}  # inject custom system prompt
)

# debug: one end-to-end question to confirm the chain returns an answer and sources
result = conversation_chain.invoke({"question": "describe the tax changes"})
result

In [None]:
def chat(question, history):
    """
    Gradio chat callback: run the question through `conversation_chain` and
    append a citation list built from the returned source documents.

    `history` is accepted because the chat UI passes it, but it is not read
    here; the chain is given only the question.

    Returns:
      str: The answer followed by a "Sources" list, or by a "NO SOURCE" marker
           when the chain returned no source documents.
    """
    def _citation(i, md):
        src = md.get("source", "Unknown source")
        sec = md.get("section", "Unknown section")
        pg = md.get("page_num", "Unknown page")
        return f"**Document {i}:** Source: {src} | Section: {sec} | Page: {pg}"

    result = conversation_chain.invoke({"question": question})
    reply = result.get("answer", "")
    source_docs = result.get("source_documents", [])

    if not source_docs:
        return reply + "\n\n**NO SOURCE:**\n"

    metadata_lines = [_citation(i, doc.metadata) for i, doc in enumerate(source_docs, start=1)]
    return reply + "\n\n**Sources:**\n" + "\n".join(metadata_lines)


In [None]:
# And in Gradio: serve `chat` through a chat UI without opening a browser tab.
# NOTE(review): `view` holds whatever launch() returns, not the ChatInterface itself.

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=False)

## Combine the functions

In [10]:
# Initialize and constants
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')
db_name = "budget_vector_db"
MODEL = 'gpt-4o-mini'

# Find all PDF files in the knowledge_base directory and its subdirectories.
# Sorted because glob order depends on the filesystem; a stable order keeps the
# chunk order (and anything derived from it) reproducible between runs.
pdf_files = sorted(glob.glob(r'knowledge_base/**/*.pdf', recursive=True))

all_chunks = []

for pdf_path in pdf_files:
    # Extract source name from the file path.
    source_name = os.path.basename(pdf_path)

    # Extract year from the file name and set doc_type
    year_match = re.search(r'fy(\d{4})_budget_statement\.pdf', source_name)
    if year_match:
        year = year_match.group(1)
        doc_type = f"budget_statement_{year}"  # or just year, if you prefer
    else:
        doc_type = "budget_statement"  # Default doc_type if year extraction fails

    # Extract sections and clean them.
    raw_sections = extract_sections_from_toc(pdf_path)
    sections = clean_section_titles(raw_sections)

    # Extract page texts (skipping the ToC pages) and clean them.
    raw_pages = extract_page_texts(pdf_path, start_page=2, page_offset=1)
    cleaned_pages = [{
        "page_num": page["page_num"],
        "text": clean_text(page["text"])
    } for page in raw_pages]

    # A PDF with no extractable text (e.g. a scan) yields no pages; skip it
    # rather than passing an empty page list to the steps below.
    if not cleaned_pages:
        print(f"Skipping {source_name}: no extractable text found")
        continue

    # Build metadata.
    metadata_pages = build_metadata_tuples(sections, cleaned_pages, offset=0, source=source_name, doc_type=doc_type)

    # Chunk the text.
    chunks_temp = langchain_chunk_texts(metadata_pages, chunk_size=1000, chunk_overlap=200)

    # Convert chunks to documents.
    chunks = convert_chunks_to_documents(chunks_temp)

    # Append chunks to all_chunks
    all_chunks.extend(chunks)


In [None]:
# Convert to LangChain Document objects
from langchain_core.documents import Document

# Same build as the single-document cell earlier, but over all_chunks (every PDF found).
docs = [
    Document(
        page_content=chunk["context"],
        metadata=chunk["metadata"]
    )
    for chunk in all_chunks
]

# Initialize embeddings
embeddings = OpenAIEmbeddings()  # or HuggingFaceEmbeddings(...) if you prefer free

# Delete previous DB if exists
# (opens the persisted store at db_name only to drop its collection)
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create and persist vectorstore
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=db_name
)

# NOTE(review): _collection is a private attribute of the vector store wrapper.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")




  Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()


Vectorstore created with 242 documents


In [13]:
# Convert to LangChain Document objects
# NOTE(review): this cell repeats the build in the previous cell (same inputs,
# same db_name); running both re-embeds every chunk a second time.
docs = [Document(page_content=chunk["context"], metadata=chunk["metadata"]) for chunk in all_chunks]

# Initialize embeddings
embeddings = OpenAIEmbeddings()

# Delete previous DB if exists
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create and persist vectorstore
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=db_name)

print(f"Vectorstore created with {vectorstore._collection.count()} documents")



Vectorstore created with 242 documents


In [14]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG; k is how many chunks to use
# NOTE(review): the chat() defined in the next cell builds its own retriever per
# question, so this module-level one looks unused from here on.
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})


  llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
  memory = ConversationBufferMemory(memory_key='chat_history', output_key='answer', return_messages=True)


In [51]:
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
import re

# Strict but flexible system message: answers must come from the FY2024/FY2025
# documents, cross-year synthesis is allowed, and a fixed refusal sentence is
# prescribed when the documents do not support an answer.
system_msg = (
    "You are a highly reliable assistant answering questions based on Singapore’s FY2024 and FY2025 budget documents. "
    "You MUST use only the content from these documents. You MAY summarize and synthesize across them, including comparing statistics across FY2024 and FY2025 when both are available in the retrieved content. Do not guess or fabricate any data. "
    "If the answer is not even indirectly supported by the documents, respond: 'I do not have sufficient information based on the provided documents.' "
    "Only mention fiscal years FY2024 and FY2025 unless another year is explicitly stated in the documents."
)

# Flexible user instruction (no hardcoded fallback terms like GDP).
# Template variables: {context} and {question}.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_msg),
    HumanMessagePromptTemplate.from_template(
        "Here are some documents to help you:\n\n{context}\n\n"
        "Now answer the question: {question}\n\n"
        "If the documents do not directly answer the question, but contain related figures or closely associated statistics, you MAY include them with a clear disclaimer explaining how they are related."
    )
])

# Conversation memory: a single module-level object that chat() passes to every chain it builds.
memory = ConversationBufferMemory(
    memory_key='chat_history',
    output_key='answer',
    return_messages=True
)

# ✅ Main function
def chat(question: str, history) -> str:
    """
    Gradio chat callback: answer a question with RAG over the budget vector
    store and append a list of the source chunks used.

    Retrieval is restricted to a single budget document only when the question
    mentions exactly one year and that year is 2024 or 2025. A question that
    names several years (e.g. "from 2024 to 2025"), or an explicit comparison of
    2024 and 2025, searches all documents, because filtering on the first year
    found would hide the other year's document. If a filtered search returns
    nothing, it is retried without the filter.

    Parameters:
      question (str): The user's question.
      history: Chat history supplied by the UI. Not read here; the chain is
               given only the question plus the module-level `memory`.

    Returns:
      str: The model's answer followed by a "Sources" list, or by a
           "No Sources Found" marker when the chain returned no sources.
    """
    lowered = question.lower()

    # Collect every distinct year mentioned, not just the first one.
    # Lookarounds (rather than consuming the neighbouring characters) let two
    # years separated by a single character both be found.
    mentioned_years = {
        m.group(1) or m.group(2)
        for m in re.finditer(r'fy\s*(20\d{2})|(?<!\d)(20\d{2})(?!\d)', lowered)
    }
    year = next(iter(mentioned_years)) if len(mentioned_years) == 1 else None

    # Comparison detection — don't filter if comparing both years
    if "compare" in lowered and "2024" in question and "2025" in question:
        filter_kwargs = {}
    elif year in {"2024", "2025"}:
        target_source = f"fy{year}_budget_statement.pdf"
        filter_kwargs = {"where": {"source": {"$eq": target_source}}}
    else:
        filter_kwargs = {}

    print(f"[DEBUG] Running retrieval with filter: {filter_kwargs.get('where', 'None')}")

    # Initial retriever
    search_kwargs = {"k": 25}
    if "where" in filter_kwargs:
        search_kwargs["filter"] = filter_kwargs["where"]
    retriever = vectorstore.as_retriever(search_kwargs=search_kwargs)
    docs = retriever.invoke(question)

    # Retry without filter if nothing found; the chain below then uses the
    # unfiltered retriever as well.
    if not docs and filter_kwargs:
        print("[DEBUG] No results with filter. Retrying without filter...")
        retriever = vectorstore.as_retriever(search_kwargs={"k": 25})
        docs = retriever.invoke(question)

    print(f"[DEBUG] Retrieved {len(docs)} documents")
    print(f"[DEBUG] Sources in retrieval: {[doc.metadata.get('source') for doc in docs]}")

    for i, doc in enumerate(docs, 1):
        print(f"[Doc {i}] {doc.metadata.get('source', 'Unknown')} | "
              f"Section: {doc.metadata.get('section', 'Unknown')} | "
              f"Page: {doc.metadata.get('page_num', 'Unknown')}")

    # Setup chain: built per call so it picks up this question's retriever,
    # while the shared `memory` carries the conversation across calls.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        return_source_documents=True,
        combine_docs_chain_kwargs={"prompt": prompt},
        output_key="answer"
    )

    result = chain.invoke({"question": question})
    answer = result.get("answer", "")
    source_docs = result.get("source_documents", [])

    # Format citations
    metadata_lines = []
    for i, doc in enumerate(source_docs, 1):
        md = doc.metadata
        src = md.get("source", "Unknown source")
        sec = md.get("section", "Unknown section")
        pg = md.get("page_num", "Unknown page")
        metadata_lines.append(f"**Doc {i}:** {src} | {sec} | Page {pg}")

    if metadata_lines:
        answer += "\n\n**Sources:**\n" + "\n".join(metadata_lines)
    else:
        answer += "\n\n**No Sources Found**"

    return answer


In [None]:
# And in Gradio: serve the year-aware `chat` defined above without opening a browser tab.
# The [DEBUG] lines printed by chat() appear in this cell's output while the UI is in use.

view = gr.ChatInterface(chat, type="messages").launch(inbrowser=False)

* Running on local URL:  http://127.0.0.1:7876

To create a public link, set `share=True` in `launch()`.


[DEBUG] Running retrieval with filter: None
[DEBUG] Retrieved 25 documents
[DEBUG] Sources in retrieval: ['fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2024_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2025_budget_statement.pdf', 'fy2024_budget_statement.pdf']
[Doc 1] fy2024_budget_statement.pdf | Section: Securing Our Fiscal Position | Page: 80
[Doc 2] fy2025_budget_statement.

In [None]:
# Example question to try in the chat UI: "what is the budget deficit for 2024"



In [42]:
# Spot-check raw retrieval (no LLM): print the metadata and the first 400
# characters of the top 10 chunks for a sample question.
query = "what is the budget deficit for 2025"
# invoke() is used instead of the deprecated get_relevant_documents(),
# matching how chat() queries its retriever.
docs = vectorstore.as_retriever(search_kwargs={"k": 10}).invoke(query)
for d in docs:
    print(d.metadata)
    print(d.page_content[:400])


{'doc_type': 'budget_statement_2024', 'page_num': 80, 'section': 'Securing Our Fiscal Position', 'source': 'fy2024_budget_statement.pdf'}
we expect to end FY2023 with a deficit of $3.6 billion , or 0.5%  of GDP .  
 
172. For FY2024, we are budgeting a small surplus of $0.8 billion, or 
0.1% of GDP , which is essentially a balanced fiscal position. The 
overall stance is appropriate , as we are  provid ing targeted support
{'doc_type': 'budget_statement_2024', 'page_num': 80, 'section': 'Securing Our Fiscal Position', 'source': 'fy2024_budget_statement.pdf'}
economies where public finances are on an unsustainable path, and 
fiscal systems are at risk of breaking. We must never allow this to 
happen in Singapore. Instead , let us uphold the ethos of fiscal 
discipline and responsibility that has served us well, and ensure  that 
our fiscal position  always  remains balanced , sound, and sustainable.   
 
FY2023 and FY2024 Fiscal Position  
 
168. Sir, 
{'doc_type': 'budget_statement_202

In [None]:
# Wider retrieval spot-check: list source, page and section for up to 100
# chunks, with the first 300 characters of each.
query = "What does the FY2025 budget document say about government spending as a % of GDP"
# invoke() is used instead of the deprecated get_relevant_documents(),
# matching how chat() queries its retriever.
docs = vectorstore.as_retriever(search_kwargs={"k": 100}).invoke(query)

for d in docs:
    print(f"{d.metadata['source']} | Page {d.metadata['page_num']} | {d.metadata['section']}")
    print(d.page_content[:300], "\n---\n")
