# In [None]:
import pdfplumber
import re

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF with the e-mail footer removed.

    Each page is read with pdfplumber.  If either of the two footer
    sentences below occurs on a page, that sentence and everything after it
    on the page is discarded.  The page texts are concatenated, each
    followed by a newline.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string holding the cleaned text of all pages.
    """
    footer_text_1 = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at"
    footer_text_2 = "+603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer can yield None (older pdfplumber
            # releases) or an empty string; normalise to "" so the footer
            # handling below never operates on None.
            page_text = page.extract_text() or ""
            for footer in (footer_text_1, footer_text_2):
                # partition() returns the whole text unchanged when the
                # footer is absent, so no membership test is needed.
                page_text = page_text.partition(footer)[0]
            page_texts.append(page_text)

    # Every page, including the last one, is terminated by a newline.
    return "".join(page_text + "\n" for page_text in page_texts)

# Step 2: Extract and parse the lines
def parse_lines(lines, start=8, stop=48):
    """Parse transaction rows from statement text lines and print each field.

    A row is a line that starts with a ``d/m/yyyy`` date and is laid out as
    ``<date> <status> <transaction type> <reference> <description ...>
    <details> <amount> <wallet balance>``.  Lines that follow a row and do
    not start with a date are treated as continuations of that row and are
    appended to its reference.

    Args:
        lines: Sequence of text lines.  It is not modified; the function
            works on a private copy.
        start: Zero-based index of the first line to inspect.  The default
            of 8 presumably skips the statement header -- confirm against
            the actual PDF layout.
        stop: Zero-based index one past the last line to inspect.  The
            default of 48 keeps the original 40-line window.

    Returns:
        A list with one dict per parsed row, holding the keys ``date``,
        ``status``, ``transaction_type``, ``reference``, ``description``,
        ``details``, ``amount`` and ``wallet_balance`` (all strings).
    """
    # Longer names come before their prefixes ("DuitNow QR TNGD" before
    # "DuitNow QR") so the most specific type wins.
    transaction_types = (
        "DuitNow QR TNGD",
        "DuitNow QR",
        "Reload",
        "eWallet Cash Out",
        "DUITNOW_RECEI",
        "Transfer to Wallet",
        "Payment",
    )
    date_pattern = re.compile(r'\d{1,2}/\d{1,2}/\d{4}')

    # Continuation lines are cleaned up below; do that on a copy so the
    # caller's list is left untouched.
    lines = list(lines)
    records = []

    for i in range(max(start, 0), min(stop, len(lines))):
        line = lines[i].strip()

        # Only lines that begin with a date start a transaction.  Anything
        # else is a continuation line that belongs to the previous row.
        if not date_pattern.match(line):
            continue

        # Split the line to extract the date and status
        parts = line.split(maxsplit=2)
        if len(parts) < 3:
            continue
        date, status, remaining_line = parts

        # The transaction type is the column right after the status, so
        # prefer a prefix match; fall back to a substring search otherwise.
        transaction_type = next(
            (tt for tt in transaction_types if remaining_line.startswith(tt)),
            None,
        )
        if transaction_type is None:
            transaction_type = next(
                (tt for tt in transaction_types if tt in remaining_line),
                "Unknown",
            )
        if transaction_type != "Unknown":
            # Remove only the type column itself; the same word may appear
            # again in the description (e.g. "Quick Reload Payment").
            remaining_line = remaining_line.replace(transaction_type, "", 1)
        remaining_line = remaining_line.strip()

        # Re-join text that the PDF layout wrapped onto the following line.
        has_next = i + 1 < len(lines)
        if transaction_type == "Reload":
            # NOTE(review): the repaired text has no closing parenthesis even
            # though the removed fragment is "Balance)" -- confirm intent.
            remaining_line = remaining_line.replace(
                "Quick Reload Payment (via GO+",
                "Quick Reload Payment (via GO+Balance",
            )
            if has_next and "Balance)" in lines[i + 1]:
                lines[i + 1] = lines[i + 1].replace("Balance)", "").strip()
        elif transaction_type == "DUITNOW_RECEI":
            transaction_type = "DUITNOW_RECEIVEFROM"
            if has_next and "VEFROM" in lines[i + 1]:
                lines[i + 1] = lines[i + 1].replace("VEFROM", "").strip()

        # Print the remaining line for debugging
        print(f"Remaining line after removing transaction type '{transaction_type}': {remaining_line}")

        tokens = remaining_line.split()
        if len(tokens) < 3:
            # Too few columns to hold details, amount and wallet balance;
            # report the row instead of failing with an IndexError.
            print(f"Skipping malformed line {i + 1}: {line}")
            continue

        # The reference starts with the first token and continues over the
        # following lines until the next dated row.
        reference_parts = [tokens[0]]
        j = i + 1
        while j < len(lines) and not date_pattern.match(lines[j].strip()):
            fragment = lines[j].strip()
            if fragment:
                reference_parts.append(fragment)
            j += 1
        reference = " ".join(reference_parts)

        # The last three tokens are details, amount and wallet balance;
        # whatever lies between the reference and them is the description.
        wallet_balance = tokens[-1]
        amount = tokens[-2]
        details = tokens[-3]
        description = " ".join(tokens[1:-3])

        # Print the parsed parts
        print(f"Date: {date}")
        print(f"Status: {status}")
        print(f"Transaction Type: {transaction_type}")
        print(f"Reference: {reference}")
        print(f"Description: {description}")
        print(f"Details: {details}")
        print(f"Amount (RM): {amount}")
        print(f"Wallet Balance: {wallet_balance}")

        records.append({
            "date": date,
            "status": status,
            "transaction_type": transaction_type,
            "reference": reference,
            "description": description,
            "details": details,
            "amount": amount,
            "wallet_balance": wallet_balance,
        })

    return records

# Main function to execute the steps
def main(pdf_path):
    """Extract the text of the PDF at ``pdf_path``, echo it, and parse it.

    The first 40 lines of the extracted text are printed with 1-based line
    numbers as a debugging aid, after which all lines are handed to
    ``parse_lines``.
    """
    lines = extract_text_from_pdf(pdf_path).split('\n')

    # Debug preview: at most the first 40 lines, numbered from 1
    for line_number, content in enumerate(lines[:40], start=1):
        print(f"Line {line_number}: {content}")

    # Hand the complete line list to the parser
    parse_lines(lines)

# Example usage
# Placeholder value: replace it with the path of the statement PDF to parse.
pdf_path = r'Your tng excel'

# Run the pipeline only when this file is executed directly (script or
# notebook cell), so importing it does not try to open the placeholder path.
if __name__ == "__main__":
    main(pdf_path)