### Extract Data


In [1]:
import PyPDF2 

"""
Could have used OCR like https://github.com/opendatalab/MinerU
if there were many graphs and charts.
"""

# Location of the source PDF. NOTE(review): despite the name, this is the path
# of a single file, not a directory.
DATA_DIR = "../data/For Task - Policy file.pdf"

def extract_text(pdf_path):
    """Read a PDF and return its text page by page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one dict per page, in document order. Each dict has the
        1-based page number under ``"page"`` and the result of
        ``page.extract_text()`` under ``"content"``.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build the result inside the ``with`` block so the file is still open
        # while the pages are being read.
        return [
            {"page": page_number, "content": pdf_page.extract_text()}
            for page_number, pdf_page in enumerate(pdf_reader.pages, start=1)
        ]

# One {"page": ..., "content": ...} dict per PDF page, in page order.
raw_pages = extract_text(DATA_DIR)

In [2]:
# Split into chunks; keep page & paragraph info for better context

""" 
Used RecursiveCharacterTextSplitter, which is a semantic-preserving hierarchical chunking technique.
It keeps citation info (page, paragraph, table), which will help me find any errors.
"""

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Separators are tried in order. The trailing " " and "" are fallbacks:
    # without them, a piece that contains none of the earlier separators is
    # emitted unchanged even when it is longer than chunk_size.
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)

# Build one Document per chunk, tagged with its source page and its position
# within that page.
docs = []
for page_record in raw_pages:
    page_chunks = splitter.split_text(page_record["content"])
    docs.extend(
        Document(
            page_content=chunk_text,
            # NOTE: "paragraph" is the 1-based index of the chunk within its
            # page, not a paragraph number taken from the PDF itself.
            metadata={"page": page_record["page"], "paragraph": chunk_no},
        )
        for chunk_no, chunk_text in enumerate(page_chunks, start=1)
    )


In [3]:
import os, dotenv

# Read variables from a local .env file (python-dotenv) so the API key stays
# out of the notebook.
dotenv.load_dotenv()
# None when GEMINI_API_KEY is not set; nothing here checks for that.
genai_api = os.getenv("GEMINI_API_KEY") 

### Vector Store

In [4]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI 
from langchain_community.vectorstores import Chroma

# Gemini is used both as the embedding model (here) and as the LLM.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=genai_api
)
# Deterministic ids (page number + per-page chunk index). With the default
# random ids, every re-run of this cell appends another copy of each chunk to
# the persisted collection, and retrieval starts returning duplicates.
# Copies already written by earlier runs under random ids are not removed;
# delete the persist directory once to start from a clean store.
chunk_ids = [
    f"page{d.metadata['page']}-chunk{d.metadata['paragraph']}" for d in docs
]
vectorstore = Chroma.from_documents(
    docs,
    embeddings,
    ids=chunk_ids,
    persist_directory="../data/chroma_db"
)


### Chat Bot

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Gemini chat model; it is handed to the document chain below to write answers.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.2, # low temperature: the task needs precise, consistent answers
    google_api_key=genai_api
)

from langchain_core.prompts import MessagesPlaceholder

# Simple prompt; additional rules or one-shot / few-shot examples can be added later.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant for a financial policy document. "
            "Answer questions and include the exact page & paragraph citations.\n\nContext:\n{context}",
        ),
        # Earlier turns of the conversation. The message-history wrapper passes
        # them in under "chat_history"; without this slot they are supplied but
        # never reach the model, so follow-up questions lose their context.
        # optional=True keeps the prompt usable when no history is given.
        MessagesPlaceholder("chat_history", optional=True),
        ("human", "{input}"),
    ]
)


from langchain_core.prompts import PromptTemplate

# Show each retrieved chunk to the model together with its page/paragraph
# metadata. By default the stuff chain formats documents with "{page_content}"
# only, so the model is asked for citations it has no way to know and makes
# them up.
document_prompt = PromptTemplate.from_template(
    "[page {page}, paragraph {paragraph}]\n{page_content}"
)

qa_chain = create_stuff_documents_chain(llm, prompt, document_prompt=document_prompt)
# NOTE(review): retrieval searches on the latest question text only, so a
# follow-up such as "What about debt?" is looked up without earlier turns.
retriever = vectorstore.as_retriever()
rag_chain = create_retrieval_chain(retriever, qa_chain)

In [6]:
# Conversation memory


from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory chat histories, one per session id.
store = {}


def _get_session_history(sid):
    """Return the stored history for ``sid``, registering an empty one on first use."""
    return store.setdefault(sid, ChatMessageHistory())


# Wrap the RAG chain so each call reads from and appends to its session's history.
chat = RunnableWithMessageHistory(
    rag_chain,
    _get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)


In [7]:
# A helper function to get answer from the chat
def ask(q, sid="default"):
    """Send one question to the conversational chain and return its answer text.

    Args:
        q: The user's question.
        sid: Session id passed to the chain as ``configurable.session_id``.

    Returns:
        The value stored under the ``"answer"`` key of the chain's response.
    """
    payload = {"input": q}
    run_config = {"configurable": {"session_id": sid}}
    response = chat.invoke(payload, run_config)
    return response["answer"]
    
# Smoke test: a first question, asked in the "default" session.
print(ask("What are the Principles of Responsible Financial Management?"))

The key financial measures established by the Government satisfy various principles of responsible financial management specified within the Financial Management Act 1996, these are: (a) ensuring that the total liabilities of the Territory are at prudent levels to provide a buffer against factors that may impact adversely on the level of total Territory liabilities in the (see page 1, Principles of Responsible Financial Management, paragraph a).


In [10]:
# Follow-up in the same "default" session; the question only makes sense
# together with the previous turn.
print(ask("What about debt?"))

The policy focuses on ensuring total liabilities are at prudent levels and that operating expenses do not exceed operating income. (a) and (b)


In [11]:
# Fact-lookup question; the system prompt asks for a page & paragraph citation.
print(ask("when did the Government has a commitment to fund accrued superannuation liabilities?"))

The Government has a commitment to fund 90% of accrued superannuation liabilities by 30 June 2040 (Page 1.2.6, Paragraph "90% coverage of accrued superannuation liabilities by 2039-40").
