EXTRACTING TEXT FROM PDF FILES

In [5]:
# batch_pdf_to_text_pypdf.py
from pypdf import PdfReader
import os

# Input and output directories
input_dir = "../data/raw/SARB"
output_dir = "../data/text/"
os.makedirs(output_dir, exist_ok=True)

# Loop over all PDFs in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(input_dir, filename)
        text_path = os.path.join(output_dir, filename.replace(".pdf", ".txt"))

        all_text = ""
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # skip pages with no extractable text
                all_text += page_text + "\n"

        # Write extracted text to a .txt file
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(all_text)

        print(f"Extracted {filename}")


Extracted Second Edition 2024 Financial Stability Review_Final_.pdf
Extracted sarb-2024-25.pdf
Extracted MPROCT2024INTERNET.pdf
Extracted Monetary Policy Review April 2025.pdf
Extracted First Edition 2025 Financial Stability Review_1.pdf
Extracted SARB Annual Financial Statements 2023-24.pdf
Extracted Tax chronology 2025 Final.pdf


CLEANING TEXT

In [7]:
import os
import re

# --- Cleaning & normalization function ---
def clean_and_normalize(text):
    # Lowercase
    text = text.lower()
    
    # Remove headers/footers/page numbers
    text = re.sub(r'page \d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'sarb.*report', '', text, flags=re.IGNORECASE)  # adjust pattern to your reports
    
    # Remove copyright and boilerplate
    text = re.sub(r'[©®]', '', text)
    text = re.sub(r'south african reserve bank', '', text, flags=re.IGNORECASE)
    text = re.sub(r'all rights reserved', '', text, flags=re.IGNORECASE)
    
    # Remove multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[ ]+', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

# --- Batch processing ---
input_dir = "../data/text/"
output_dir = "../data/clean/"
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    if filename.endswith(".txt"):
        with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as f:
            text = f.read()
        
        cleaned_text = clean_and_normalize(text)

        # Save cleaned text
        with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
            f.write(cleaned_text)  # write tokenized version as a single string
        
        print(f"Cleaned {filename}")


Cleaned MPROCT2024INTERNET.txt
Cleaned Second Edition 2024 Financial Stability Review_Final_.txt
Cleaned sarb-2024-25.txt
Cleaned Monetary Policy Review April 2025.txt
Cleaned Tax chronology 2025 Final.txt
Cleaned SARB Annual Financial Statements 2023-24.txt
Cleaned First Edition 2025 Financial Stability Review_1.txt
