EXTRACTING TEXT FROM PDF FILES

In [2]:
# batch_pdf_to_text_pypdf.py
from pypdf import PdfReader
import os

# Input and output directories
input_dir = "../data/raw/SARB"
output_dir = "../data/text/"
os.makedirs(output_dir, exist_ok=True)

# Convert every PDF in the input directory into a UTF-8 text file that shares
# the PDF's base name. sorted() makes the processing order deterministic
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(input_dir)):
    stem, extension = os.path.splitext(filename)
    # Compare the extension only, case-insensitively, so "REPORT.PDF" is included.
    if extension.lower() != ".pdf":
        continue

    pdf_path = os.path.join(input_dir, filename)
    # Swap just the final extension; str.replace(".pdf", ".txt") would also
    # rewrite a ".pdf" that appears in the middle of the file name.
    text_path = os.path.join(output_dir, stem + ".txt")

    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # skip pages with no extractable text
            page_texts.append(page_text + "\n")
    # Join once instead of growing a string page by page.
    all_text = "".join(page_texts)

    # Write extracted text to a .txt file
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(all_text)

    print(f"Extracted {filename}")


Extracted Second Edition 2024 Financial Stability Review_Final_.pdf
Extracted sarb-2024-25.pdf
Extracted MPROCT2024INTERNET.pdf
Extracted Monetary Policy Review April 2025.pdf
Extracted First Edition 2025 Financial Stability Review_1.pdf
Extracted SARB Annual Financial Statements 2023-24.pdf
Extracted Tax chronology 2025 Final.pdf


CLEANING TEXT

In [3]:
import os
import re

def clean_and_normalize(text):
    """Lowercase extracted text, strip boilerplate and normalise rand amounts.

    Steps, in order:
      1. lowercase the whole text;
      2. remove "page N" markers and any same-line "sarb ... report" span;
      3. remove copyright/registered symbols and two boilerplate phrases;
      4. rewrite rand amounts: "r3.4 billion" -> "3,400,000,000 ZAR" and
         "r1,500.50" -> "1,500.50 ZAR";
      5. collapse repeated newlines and spaces;
      6. re-join words hyphenated across a line break;
      7. strip leading/trailing whitespace.

    Args:
        text: The raw text to clean (str).

    Returns:
        The cleaned string. It is all lowercase except for the inserted
        "ZAR" currency suffix.
    """
    # Local import: Decimal scales amounts exactly, whereas int(float * scale)
    # can truncate a product that lands just below the intended integer.
    from decimal import Decimal

    # Lowercase
    text = text.lower()

    # Remove headers/footers/page numbers
    text = re.sub(r'page \d+', '', text, flags=re.IGNORECASE)
    # NOTE(review): '.*' is greedy and stays on one line, so everything between
    # the first "sarb" and the last "report" on a line is removed.
    text = re.sub(r'sarb.*report', '', text, flags=re.IGNORECASE)

    # Remove copyright and boilerplate
    text = re.sub(r'[©®]', '', text)
    text = re.sub(r'south african reserve bank', '', text, flags=re.IGNORECASE)
    text = re.sub(r'all rights reserved', '', text, flags=re.IGNORECASE)

    # --- Normalize currency formats (e.g., 'R3.4 billion' -> '3,400,000,000 ZAR') ---

    scale_factors = {
        'million': 10 ** 6,
        'billion': 10 ** 9,
        'trillion': 10 ** 12,
    }

    # A number must start with a digit; commas are accepted only as thousands
    # separators and a period only as a decimal point followed by digits.
    # This keeps sentence punctuation ("r100," / "r100.") out of the match.
    amount = r'(\d+(?:,\d{3})*(?:\.\d+)?)'

    def normalize_rand_scale(match):
        """Expand a matched 'r<number> <scale>' amount to a full integer in ZAR."""
        value = Decimal(match.group(1).replace(',', ''))
        factor = scale_factors[match.group(2).lower()]
        return f"{int(value * factor):,d} ZAR"

    # '\b' before the currency letter stops the 'r' that ends an ordinary word
    # ("for 3 million", "year,") from being read as a rand prefix.
    text = re.sub(
        r'\br\s?' + amount + r'\s+(million|billion|trillion)\b',
        normalize_rand_scale,
        text,
        flags=re.IGNORECASE,
    )

    # Normalize simple 'R100' format to '100 ZAR'. The same word-boundary rule
    # also prevents re-matching the 'R' of a "ZAR" suffix inserted above.
    text = re.sub(r'\br' + amount, r'\1 ZAR', text, flags=re.IGNORECASE)

    # Remove multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ ]+', ' ', text)

    # Fix words split by hyphens at the end of a line
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # Strip leading/trailing whitespace
    text = text.strip()

    return text

# --- Batch processing ---
input_dir = "../data/text/"
output_dir = "../data/clean/"
os.makedirs(output_dir, exist_ok=True)

# Run every extracted .txt file through clean_and_normalize and store the
# result under the same file name in the output directory.
for filename in os.listdir(input_dir):
    if not filename.endswith(".txt"):
        continue

    source_path = os.path.join(input_dir, filename)
    target_path = os.path.join(output_dir, filename)

    with open(source_path, "r", encoding="utf-8") as f:
        text = f.read()

    cleaned_text = clean_and_normalize(text)

    # Save the cleaned text as one string
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"Cleaned {filename}")


Cleaned MPROCT2024INTERNET.txt
Cleaned Second Edition 2024 Financial Stability Review_Final_.txt
Cleaned sarb-2024-25.txt
Cleaned Monetary Policy Review April 2025.txt
Cleaned Tax chronology 2025 Final.txt
Cleaned SARB Annual Financial Statements 2023-24.txt
Cleaned First Edition 2025 Financial Stability Review_1.txt


CHUNKING 

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Specify the path to the cleaned text file to chunk
file_path = "../data/clean/sarb-2024-25.txt"

# 2. Read the entire content of the file into a single string.
# On failure the message is printed and the exception re-raised: exit() is
# meant for the interactive interpreter and, inside a notebook, shuts the
# kernel down instead of just stopping this cell.
try:
    with open(file_path, "r", encoding="utf-8") as file:
        sarb_document_text = file.read()
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise

# 3. Initialize the RecursiveCharacterTextSplitter
# chunk_size and chunk_overlap are measured in characters (length_function=len).
# Chunks are at most 200 characters, and consecutive chunks share up to
# 50 characters so context carries over between them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# 4. Split the document into chunks
docs = text_splitter.create_documents([sarb_document_text])

# 5. Preview only the first few chunks; printing every chunk floods the
# notebook output and bloats the saved file.
preview_count = 5
print(f"Created {len(docs)} chunks; showing the first {min(preview_count, len(docs))}.")
for i, doc in enumerate(docs[:preview_count]):
    print(f"--- Chunk {i+1} ---")
    print(doc.page_content)
    print("\n")

--- Chunk 1 ---
annual report 2024/25
price and financial stability 
for sustainable growth
 annual report 2024/25 1 
introduction
sarb overview 2
about this report 4
price and financial stability for


--- Chunk 2 ---
price and financial stability for 
sustainable growth
what the sarb does 6
about price and financial stability 10 
 
delivering the sarb strategy
governor’s message 14


--- Chunk 3 ---
governor’s message 14 
connecting past successes to future opportunities
 18 
 
how the sarb is governed
shareholding and dividend 30 
governance
 31 
risk management
 43 
 
the sarb’s performance


--- Chunk 4 ---
risk management
 43 
 
the sarb’s performance
monetary policy under high uncertainty 46 
responding to climate change risks 52 
maintaining financial stability 53 
protecting depositors 58


--- Chunk 5 ---
protecting depositors 58 
prudential regulation 60 
payments: the foundation of the financial system 66 
fintech in focus 68 
g20: leading the finance track 69 
people matte

EMBEDDING AND STORING IN CHROMADB

In [5]:
import chromadb
import uuid
from chromadb.config import Settings
import math
import os  # imported here so this cell does not depend on an earlier cell's import

# Collect the text of every chunk produced by the chunking step.
text_chunks = [doc.page_content for doc in docs]


# Use Chroma Cloud settings.
# SECURITY: the API key is read from the environment and must never be
# committed in the notebook. The key that used to be hardcoded here should be
# treated as leaked and rotated.
chroma_api_key = os.environ.get("CHROMA_API_KEY")
if not chroma_api_key:
    raise RuntimeError(
        "Set the CHROMA_API_KEY environment variable before running this cell."
    )

client = chromadb.CloudClient(
    api_key=chroma_api_key,
    # Tenant and database can be overridden through the environment; the
    # defaults are the values this notebook has always used.
    tenant=os.environ.get("CHROMA_TENANT", 'd834b4aa-3aee-4c55-89ec-b6dd1ed1c96d'),
    database=os.environ.get("CHROMA_DATABASE", 'financial-docs-project'),
)

# Create or get your collection
collection = client.get_or_create_collection(name="sarb_documents")

# Prepare your data for insertion
documents = text_chunks # This is the list of your cleaned text chunks

# Only a small demo subset is uploaded.
demo_chunk_limit = 50
demo_docs = documents[:demo_chunk_limit]

# Deterministic IDs (derived from the source label and chunk position) plus
# upsert() make this cell idempotent: re-running it overwrites the same
# records. Random uuid4 IDs with add() stored a fresh copy of every chunk on
# each run.
demo_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"sarb_document/{position}"))
    for position in range(len(demo_docs))
]
# One dict per chunk; `[{...}] * n` would repeat a single shared dict object.
demo_metadatas = [{"source": "sarb_document"} for _ in demo_docs]

collection.upsert(
    documents=demo_docs,
    ids=demo_ids,
    metadatas=demo_metadatas
)

print(f"Added {len(demo_docs)} chunks to Chroma Cloud for demo.")


Added 50 chunks to Chroma Cloud for demo.


RAG 

In [6]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings


# Embedding model the LangChain wrapper uses to embed incoming queries.
# NOTE(review): the chunks were stored with collection.add() without explicit
# embeddings, so they were presumably embedded by Chroma's own default
# function -- confirm it matches this model, otherwise retrieval quality suffers.
embedding_model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
embedding_function = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Wrap the existing cloud collection (the name must match the one created on
# the Chroma client) in a LangChain vector store.
vectordb = Chroma(
    client=client,
    collection_name="sarb_documents",
    embedding_function=embedding_function,
)

# Retriever that returns the 3 best-matching chunks per query
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))
print("Retriever connected to Chroma Cloud successfully.")

  vectordb = Chroma(


Retriever connected to Chroma Cloud successfully.


In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline

# FLAN-T5 checkpoint to load; the base size was picked for speed and lower
# memory use.
model_name = "google/flan-t5-base"

# Fetch the tokenizer and the seq2seq model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Build a text-to-text generation pipeline around them; max_length=150 is
# passed through as the generation length limit.
pipe = pipeline(
    task="text2text-generation",
    tokenizer=tokenizer,
    model=model,
    max_length=150,
)

# Expose the Hugging Face pipeline as a LangChain LLM object
llm = HuggingFacePipeline(pipeline=pipe)
print("LLM initialized successfully.")

Device set to use cuda:0


LLM initialized successfully.


  llm = HuggingFacePipeline(pipeline=pipe)


In [18]:
from langchain.chains import RetrievalQA

# Generate one question/answer pair per chunk for the first N chunks.
# Each pair takes two LLM calls: the first writes a question about the chunk,
# the second answers that question using the same chunk as context.
num_qa_pairs_to_generate = 20
limited_docs = docs[:num_qa_pairs_to_generate]

qa_pairs = []

for doc in limited_docs:
    context = doc.page_content

    # --- Step 1: ask the model for a question about this chunk ---
    prompt_q = f"""
        You are a financial analyst AI. Read the following text and generate ONE high-quality question that:
        - asks about a specific fact, figure, or policy in the context
        - DO NOT ask for the main idea or a title.
        - Is clear and concise
        - Does NOT ask for a title or trivial detail
        - Can be answered from the context

        Context:
        {context}
        """
    question = llm.invoke(prompt_q).strip()

    # --- Step 2: ask the model to answer that question from the same chunk ---
    prompt_a = f"""
        You are a financial analyst AI. Based on the following context, generate a short, clear answer to the question. 
        - If the answer is a fact, provide the fact. 
        - Do not output letters or placeholders. 
        - Answer in a complete sentence using information from the context.
        - Provide a concise answer in one or two sentences. 
        - Do not include headings, page numbers, or unrelated text.

        Context:
        {context}

        Question:
        {question}

        Answer:
        """
    answer = llm.invoke(prompt_a).strip()

    # Record the pair; the first 100 characters of the chunk serve as a
    # reference back to its source text.
    qa_pairs.append(dict(question=question, answer=answer, source=context[:100]))

print(f"Generated {len(qa_pairs)} QA pairs:")
for pair in qa_pairs:
    print(pair)



Generated 20 QA pairs:
{'question': 'What is the main idea of the report?', 'answer': 'price and financial stability for sustainable growth annual report 2024/25 1 introduction sarb overview 2 about this report 4 price and financial stability for', 'source': 'annual report 2024/25\nprice and financial stability \nfor sustainable growth\n annual report 2024/25 1'}
{'question': 'What is the main idea of the passage?', 'answer': 'price and financial stability for sustainable growth what the sarb does', 'source': 'price and financial stability for \nsustainable growth\nwhat the sarb does 6\nabout price and financial'}
{'question': 'What is the governor’s message?', 'answer': 'connecting past successes to future opportunities', 'source': 'governor’s message 14 \nconnecting past successes to future opportunities\n 18 \n \nhow the sarb is gove'}
{'question': 'What is the main idea of this passage?', 'answer': 'risk management the sarb’s performance monetary policy under high uncertainty 46 re