In [8]:
import PyPDF2
import re

In [9]:
def extract_financial_info(pdf_filename):
    """
    Extracts key financial information from the provided PDF file.

    Args:
        pdf_filename: The name of the PDF file to process.

    Returns:
        A dictionary containing the extracted financial information.
    """

    # Open the PDF file
    pdf_file = open(pdf_filename, 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Initialize an empty string to store the extracted text
    extracted_text = ""

    # Iterate over all the pages and extract the text
    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        extracted_text += page.extract_text()

    # Close the PDF file
    pdf_file.close()

    # Use regular expressions to find the desired information
    financial_info = {}

    # --- Basic Information --- (with error handling)
    match = re.search(r"Company Name:\s*(.*)", extracted_text)
    financial_info["Company Name"] = match.group(1) if match else None

    match = re.search(r"Registration No:\s*(.*)", extracted_text)
    financial_info["Registration No"] = match.group(1) if match else None

    match = re.search(r"Start Fin Date:\s*(.*)", extracted_text)
    financial_info["Start Fin Date"] = match.group(1) if match else None

    match = re.search(r"End Fin Date:\s*(.*)", extracted_text)
    financial_info["End Fin Date"] = match.group(1) if match else None

    match = re.search(r"Group/Company:\s*(.*)", extracted_text)
    financial_info["Group/Company"] = match.group(1) if match else None

    match = re.search(r"Auditor Comment:\s*(.*)", extracted_text)
    financial_info["Auditor Comment"] = match.group(1) if match else None

    match = re.search(r"Reporting Standard\s*:\s*(.*)", extracted_text)
    financial_info["Reporting Standard"] = match.group(1) if match else None

    # --- Profit & Loss ---
    match = re.search(r"Revenue\s*([\d,\.]+)", extracted_text)
    financial_info["Revenue"] = match.group(1) if match else None

    match = re.search(r"Other Turnover\s*([\d,\.]+)", extracted_text)
    financial_info["Other Turnover"] = match.group(1) if match else None

    match = re.search(r"Sales of Goods\s*([\d,\.]+)", extracted_text)
    financial_info["Sales of Goods"] = match.group(1) if match else None

    match = re.search(r"Sales of Services\s*([\d,\.]+)", extracted_text)
    financial_info["Sales of Services"] = match.group(1) if match else None

    match = re.search(r"Sales of Goods & Services\s*([\d,\.]+)", extracted_text)
    financial_info["Sales of Goods & Services"] = match.group(1) if match else None

    match = re.search(r"Interest Revenue\s*([\d,\.]+)", extracted_text)
    financial_info["Interest Revenue"] = match.group(1) if match else None

    match = re.search(r"Fee And Commission Revenue\s*([\d,\.]+)", extracted_text)
    financial_info["Fee And Commission Revenue"] = match.group(1) if match else None

    match = re.search(r"Dividend Revenue\s*([\d,\.]+)", extracted_text)
    financial_info["Dividend Revenue"] = match.group(1) if match else None

    match = re.search(r"Gross Insurance/Reinsurance Premium\s*([\d,\.]+)", extracted_text)
    financial_info["Gross Insurance/Reinsurance Premium"] = match.group(1) if match else None

    match = re.search(r"Sales of Development Properties\s*([\d,\.]+)", extracted_text)
    financial_info["Sales of Development Properties"] = match.group(1) if match else None

    match = re.search(r"Service and Maintenance Income\s*([\d,\.]+)", extracted_text)
    financial_info["Service and Maintenance Income"] = match.group(1) if match else None

    match = re.search(r"Revenue Adjustment\s*([\d,\.]+)", extracted_text)
    financial_info["Revenue Adjustment"] = match.group(1) if match else None

    match = re.search(r"Cost Of Sales\s*([\d,\.]+)", extracted_text)
    financial_info["Cost Of Sales"] = match.group(1) if match else None

    match = re.search(r"Gross Profit\s*([\d,\.]+)", extracted_text)
    financial_info["Gross Profit"] = match.group(1) if match else None

    match = re.search(r"Other Items of Expense\s*\(([\d,\.]+)\)", extracted_text)
    financial_info["Other Items of Expense"] = match.group(1) if match else None

    match = re.search(r"Finance Cost\s*\(([\d,\.]+)\)", extracted_text)
    financial_info["Finance Cost"] = match.group(1) if match else None

    match = re.search(r"Depreciation Expense\s*\(([\d,\.]+)\)", extracted_text)
    financial_info["Depreciation Expense"] = match.group(1) if match else None

    match = re.search(r"Amortization Expense\s*([\d,\.]+)", extracted_text)
    financial_info["Amortization Expense"] = match.group(1) if match else None

    match = re.search(r"Employee Benefits Expense\s*\(([\d,\.]+)\)", extracted_text)
    financial_info["Employee Benefits Expense"] = match.group(1) if match else None

    # ... (rest of the regular expressions for other fields) ...

    return financial_info

In [10]:
# --- Example usage ---
pdf_filename = "sample-output-company-cepat-kaya.pdf"  # Or get this from user input
financial_data = extract_financial_info(pdf_filename)

# --- Print or further process the extracted data ---
for key, value in financial_data.items():
    print(f"{key}: {value}")

Company Name: None
Registration No: None
Start Fin Date: None
End Fin Date: None
Group/Company: None
Auditor Comment: None
Reporting Standard: FRS  Proﬁt & Loss Revenue 200,884,247.00 Other Turnover 0.00 Sales of Goods 0.00 Sales of Services 0.00 Sales of Goods & Services 1,884,247.00 Interest Revenue 0.00 Fee And Commission Revenue 0.00 Dividend Revenue 0.00 Gross Insurance/Reinsurance Premium 0.00 Sales of Development Properties 0.00 Service and Maintenance Income 0.00 Revenue Adjustment 0.00 Cost Of Sales 0.00 Gross Proﬁt 0.00 Other Items of Expense (929,327.00) Finance Cost (150,835.00) Depreciation Expense (102,272.00) Amortization Expense 0.00 Employee Beneﬁts Expense (83,066.00) Director Remuneration 0.00 Operating Lease Expenses (21,768.00) Professional Legal Expenses 0.00 Management Fees 0.00 Repair and Maintenance Expense 0.00 Operating Proﬁt (32,245.00) Other Items of Income 56,566.00 Interest Income 0.00 Fee and Commission Income 0.00 Other Operating Income 56,927.00 Divide