# In [27]:
import PyPDF2
import re

def extract_key_value_pairs(filename):
    """
    Collects label/number pairs from every page of a PDF.

    Each page's extracted text is scanned for a run of letters, whitespace
    and parentheses (the label) followed by whitespace and a run of digits,
    commas, periods and parentheses (the number). Because the label class
    includes whitespace, a label may extend across line breaks.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A dict mapping each stripped label to its stripped number string.
        When a label occurs more than once, the last occurrence wins.
    """
    pair_pattern = re.compile(r'([a-zA-ZÀ-ÿ\s\(\)]+)\s+([0-9,\(\).]+)')
    pairs = {}
    with open(filename, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            found = pair_pattern.findall(page.extract_text())
            # update() overwrites earlier entries, matching plain assignment.
            pairs.update(
                (label.strip(), amount.strip()) for label, amount in found
            )
    return pairs

def extract_text_from_pdf(filename):
    """
    Returns the text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator inserted
    between them.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The concatenated page text; an empty string if there are no pages.
    """
    with open(filename, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Joined while the file is still open, since pages are read from it.
        return "".join(page.extract_text() for page in reader.pages)

def extract_financial_data(format_data, input_text):
    """
    Fills a copy of ``format_data`` with figures found in ``input_text``.

    For every key of ``format_data`` the text is searched, ignoring case,
    for the key followed by a number. Only letters, underscores, whitespace,
    colons and hyphens may stand between the key and the number. When a key
    occurs several times, the last occurrence wins.

    Args:
        format_data: Mapping whose keys are the labels to look up. It is
            not modified.
        input_text: The text to search.

    Returns:
        A shallow copy of ``format_data``. Each matched key maps to the
        number as a float (commas removed), or to the raw matched string if
        it cannot be parsed as a float. Unmatched keys, and keys that are
        empty or whitespace-only, keep their original value.
    """
    output_data = format_data.copy()

    for key in output_data:
        words = str(key).split()
        if not words:
            # An empty label would match in front of any number in the text.
            continue

        # Escape the label so characters such as "(" and ")" are matched
        # literally instead of being read as regex syntax, and accept any
        # run of whitespace between its words so that spacing or line-wrap
        # differences between the two texts do not prevent a match.
        label = r"\s+".join(re.escape(word) for word in words)

        # The filler is lazy and contains no digits, so it cannot swallow
        # the leading digits of the value. The number must start and end
        # with a digit, which keeps stray punctuation out of the capture.
        pattern = rf"{label}(?:[^\W\d]|[\s:-])*?(\d+(?:[,.]\d+)*)\b"
        matches = re.findall(pattern, input_text, re.IGNORECASE)
        if not matches:
            continue

        # Handle potential multiple matches (take the last one for now)
        value = matches[-1]
        try:
            value = float(value.replace(",", ""))
        except ValueError:
            # Deliberate best effort: keep the raw text (e.g. "1.234.567")
            # when it is not a valid float after removing commas.
            pass
        output_data[key] = value
    return output_data

def write_output_to_txt(data, filename):
    """
    Writes the extracted data to a text file in a nicely formatted list.

    Each entry becomes one line: the key left-aligned in a 50-character
    column, a space, then the value right-aligned in a 15-character column.

    Args:
        data: Mapping of labels to values. Keys and values are rendered
            with ``str()``, so floats, strings and ``None`` are all accepted.
        filename: Path of the UTF-8 text file to create or overwrite.
    """
    # errors='replace' substitutes "?" for any character that cannot be
    # encoded (e.g. a lone surrogate), in keys as well as values, so no
    # per-line UnicodeEncodeError handling is needed.
    with open(filename, 'w', encoding='utf-8', errors='replace') as f:
        f.writelines(
            # "!s" converts first, so values lacking format-spec support
            # (such as None) are still aligned instead of raising TypeError.
            f"{key!s:<50} {value!s:>15}\n"
            for key, value in data.items()
        )

# --- Main execution ---
if __name__ == "__main__":
    # The PDF supplying the labels, the PDF to search, and the text file
    # that receives the result.
    format_file, input_file, output_file = (
        "sample-output-company-cepat-kaya.pdf",
        "maxis-2023.pdf",
        "financial_report_maxis_2023.txt",
    )

    # Labels come from the format file; figures are looked up in the input.
    format_data = extract_key_value_pairs(format_file)
    input_text = extract_text_from_pdf(input_file)
    extracted_data = extract_financial_data(format_data, input_text)

    write_output_to_txt(extracted_data, output_file)
    print(f"Financial report generated and saved to {output_file}")

# Output: Financial report generated and saved to financial_report_maxis_2023.txt
