In [85]:
import pdfplumber
import pandas as pd
import re

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    footer_text_1 = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at"
    footer_text_2 ="+603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Remove the repeated footer text
            if footer_text in page_text:
                page_text = page_text.split(footer_text_1)[0]
                page_text = page_text.split(footer_text_2)[0]
            text += page_text + "\n"
    return text

# Step 2: Extract lines 2 to 6 as information
def extract_information(lines):
    info = [
        ["Registered Name", lines[1].split(maxsplit=2)[2].strip()],
        ["Wallet ID", lines[2].split(maxsplit=2)[2].strip()],
        ["Account Status", lines[3].split(maxsplit=2)[2].strip()],
        ["Generated Date & Time", lines[4].split(maxsplit=3)[3].strip()],
        ["Transaction Period", lines[5].split(maxsplit=2)[2].strip()]
    ]
    return info

# Step 3: Parse the transaction data starting from the 9th line
def parse_transactions(lines):
    header = ["Date", "Status", "Transaction Type", "Reference", "Description", "Details", "Amount (RM)", "Wallet Balance"]
    transactions = []
    i = 8  # Start processing from the 9th line

    while i < len(lines):
        line = lines[i]
        if re.match(r'\d{1,2}/\d{1,2}/\d{4}', line):  # Check if the line starts with a date
            # Split the line to extract the date and status
            parts = line.split(maxsplit=2)
            date = parts[0]
            status = parts[1]
            remaining_line = parts[2]
            
            # Identify and remove the transaction type from the line
            transaction_types = ["DuitNow QR", "Reload", "eWallet Cash Out", "DUITNOW_RECEIVEFROM", "DuitNow QR TNGD", "Transfer to Wallet"]
            transaction_type = next((tt for tt in transaction_types if tt in remaining_line), "Unknown")
            remaining_line = remaining_line.replace(transaction_type, "").strip()
            
            # Initialize reference and description
            reference = ""
            description = ""
            
            # Continue reading lines until we encounter a line that starts with a date or reach the end
            while i < len(lines) and not re.match(r'\d{1,2}/\d{1,2}/\d{4}', lines[i]):
                if reference == "":
                    reference = remaining_line.split(maxsplit=1)[0]
                    remaining_line = remaining_line.split(maxsplit=1)[1]
                else:
                    remaining_line = lines[i]
                i += 1
            
            # Split the remaining part of the line to extract the description, details, amount, and wallet balance
            parts = remaining_line.split()
            wallet_balance = parts[-1]
            amount = parts[-2]
            details = parts[-3]
            description = " ".join(parts[1:-3])
            
            # Append the parsed transaction to the list
            transactions.append([date, status, transaction_type, reference, description, details, amount, wallet_balance])
        else:
            i += 1

    transactions_df = pd.DataFrame(transactions, columns=header)
    return transactions_df

# Step 4: Combine the parsed data into a single DataFrame
def create_combined_dataframe(info, transactions_df):
    info_df = pd.DataFrame(info, columns=["Information", "Details"])
    return info_df, transactions_df

# Step 5: Save the combined data to an Excel file with multiple sheets
def save_to_excel(info_df, transactions_df, excel_path):
    with pd.ExcelWriter(excel_path) as writer:
        info_df.to_excel(writer, sheet_name='Header Information', index=False)
        transactions_df.to_excel(writer, sheet_name='Transactions', index=False)

# Main function to execute the steps
def main(pdf_path, excel_path):
    text = extract_text_from_pdf(pdf_path)
    lines = text.split('\n')
    info = extract_information(lines)
    transactions_df = parse_transactions(lines)
    info_df, transactions_df = create_combined_dataframe(info, transactions_df)
    save_to_excel(info_df, transactions_df, excel_path)

    # Debug prints
    print("Header Information:")
    print(info_df)
    print("\nTransactions:")
    print(transactions_df)

# Example usage
pdf_path = r'C:\Users\lewka\Downloads\tng_ewallet_transactions.pdf'
excel_path = r'C:\Users\lewka\Downloads\transaction_history.xlsx'
main(pdf_path, excel_path)


KeyboardInterrupt



In [87]:
import pdfplumber

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    footer_text = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at +603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Remove the repeated footer text
            if footer_text in page_text:
                page_text = page_text.split(footer_text)[0]
            text += page_text + "\n"
    return text

# Main function to execute the steps
def main(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    lines = text.split('\n')
    
    # Print the first 80 lines for debugging
    print("First 80 lines extracted from the PDF:")
    for i, line in enumerate(lines[:80]):
        print(f"Line {i+1}: {line}")

# Example usage
pdf_path = r'C:\Users\lewka\Downloads\tng_ewallet_transactions.pdf'
main(pdf_path)

First 80 lines extracted from the PDF:
Line 1: TNG WALLET TRANSACTION HISTORY
Line 2: Registered Name LEW KAI LIANG
Line 3: Wallet ID 1000001535689604
Line 4: Account Status Active
Line 5: Generated Date & Time 22 September 2024 9:27 AM
Line 6: Transaction Period 25 June 2024 - 22 September 2024
Line 7: TNG WALLET TRANSACTION
Line 8: Date Status Transaction Type Reference Description Details Amount (RM) Wallet Balance
Line 9: 20/9/2024 Success DuitNow QR 20240920101 KANG KANG ENTERPRISE 202409202112128001001716066639 RM3.00 RM0.00
Line 10: 10000010000 09144
Line 11: TNGOW3MY1
Line 12: 71608929442
Line 13: 143
Line 14: 20/9/2024 Success Reload 20240920101 Quick Reload Payment (via GO+ 202409201310030335999776400600 RM3.00 RM3.00
Line 15: 10000010000 Balance)
Line 16: TNGOW3MY1
Line 17: 71608929442
Line 18: 142
Line 19: 20/9/2024 Success DuitNow QR 20240920101 FITWHEY GYM 4 202409202112128001001716080639 RM6.00 RM0.00
Line 20: 10000010000 06013
Line 21: TNGOW3MY1
Line 22: 71608929427
Lin

In [7]:
import pdfplumber

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    footer_text_1 = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at"
    footer_text_2 ="+603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Remove the repeated footer text
            if footer_text in page_text:
                page_text = page_text.split(footer_text)[0]
            text += page_text + "\n"
    return text

# Step 2: Extract and parse the 9th line
def parse_ninth_line(lines):
    line = lines[8]  # The 9th line (index 8)
    
    # Split the line to extract the date and status
    parts = line.split(maxsplit=2)
    date = parts[0]
    status = parts[1]
    remaining_line = parts[2]
    
    # Identify and remove the transaction type from the line
    transaction_types = ["DuitNow QR", "Reload", "eWallet Cash Out", "DUITNOW_RECEIVEFROM", "DuitNow QR TNGD", "Transfer to Wallet"]
    transaction_type = next((tt for tt in transaction_types if tt in remaining_line), "Unknown")
    remaining_line = remaining_line.replace(transaction_type, "").strip()
    
    # Print the remaining line for debugging
    print(f"Remaining line after removing transaction type '{transaction_type}': {remaining_line}")
    
    # Split the remaining part of the line to extract the reference, description, details, amount, and wallet balance
    parts = remaining_line.split()
    reference = parts[0]
    wallet_balance = parts[-1]
    amount = parts[-2]
    details = parts[-3]
    description = " ".join(parts[1:-3])
    
    # Print the parsed parts
    print(f"Date: {date}")
    print(f"Status: {status}")
    print(f"Transaction Type: {transaction_type}")
    print(f"Reference: {reference}")
    print(f"Description: {description}")
    print(f"Details: {details}")
    print(f"Amount (RM): {amount}")
    print(f"Wallet Balance: {wallet_balance}")

# Main function to execute the steps
def main(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    lines = text.split('\n')
    
    # Print the 9th line for debugging
    print("9th line of the PDF:")
    print(lines[8])
    
    # Parse the 9th line
    parse_ninth_line(lines)

# Example usage
pdf_path = r'C:\Users\lewka\Downloads\tng_ewallet_transactions.pdf'
main(pdf_path)

9th line of the PDF:
20/9/2024 Success DuitNow QR 20240920101 KANG KANG ENTERPRISE 202409202112128001001716066639 RM3.00 RM0.00
Remaining line after removing transaction type 'DuitNow QR': 20240920101 KANG KANG ENTERPRISE 202409202112128001001716066639 RM3.00 RM0.00
Date: 20/9/2024
Status: Success
Transaction Type: DuitNow QR
Reference: 20240920101
Description: KANG KANG ENTERPRISE
Details: 202409202112128001001716066639
Amount (RM): RM3.00
Wallet Balance: RM0.00


In [8]:
import pdfplumber
import re

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    footer_text = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at +603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Remove the repeated footer text
            if footer_text in page_text:
                page_text = page_text.split(footer_text_1)[0]
                page_text = page_text.split(footer_text_2)[0]
            text += page_text + "\n"
    return text

# Step 2: Extract and parse the 9th line
def parse_ninth_line(lines):
    line = lines[8]  # The 9th line (index 8)
    
    # Split the line to extract the date and status
    parts = line.split(maxsplit=2)
    date = parts[0]
    status = parts[1]
    remaining_line = parts[2]
    
    # Identify and remove the transaction type from the line
    transaction_types = ["DuitNow QR", "Reload", "eWallet Cash Out", "DUITNOW_RECEIVEFROM", "DuitNow QR TNGD", "Transfer to Wallet"]
    transaction_type = next((tt for tt in transaction_types if tt in remaining_line), "Unknown")
    remaining_line = remaining_line.replace(transaction_type, "").strip()
    
    # Initialize the reference with the first part
    reference = remaining_line.split()[0]
    
    # Keep reading lines until we reach the next date
    i = 9
    while i < len(lines) and not re.match(r'\d{1,2}/\d{1,2}/\d{4}', lines[i]):
        reference += " " + lines[i].strip()
        i += 1
    
    # Print the remaining line for debugging
    print(f"Remaining line after removing transaction type '{transaction_type}': {remaining_line}")
    
    # Split the remaining part of the line to extract the description, details, amount, and wallet balance
    parts = remaining_line.split()
    wallet_balance = parts[-1]
    amount = parts[-2]
    details = parts[-3]
    description = " ".join(parts[1:-3])
    
    # Print the parsed parts
    print(f"Date: {date}")
    print(f"Status: {status}")
    print(f"Transaction Type: {transaction_type}")
    print(f"Reference: {reference}")
    print(f"Description: {description}")
    print(f"Details: {details}")
    print(f"Amount (RM): {amount}")
    print(f"Wallet Balance: {wallet_balance}")

# Main function to execute the steps
def main(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    lines = text.split('\n')
    
    # Print the 9th line for debugging
    print("9th line of the PDF:")
    print(lines[8])
    
    # Parse the 9th line
    parse_ninth_line(lines)

# Example usage
pdf_path = r'C:\Users\lewka\Downloads\tng_ewallet_transactions.pdf'
main(pdf_path)

9th line of the PDF:
20/9/2024 Success DuitNow QR 20240920101 KANG KANG ENTERPRISE 202409202112128001001716066639 RM3.00 RM0.00
Remaining line after removing transaction type 'DuitNow QR': 20240920101 KANG KANG ENTERPRISE 202409202112128001001716066639 RM3.00 RM0.00
Date: 20/9/2024
Status: Success
Transaction Type: DuitNow QR
Reference: 20240920101 10000010000 09144 TNGOW3MY1 71608929442 143
Description: KANG KANG ENTERPRISE
Details: 202409202112128001001716066639
Amount (RM): RM3.00
Wallet Balance: RM0.00


In [None]:
import pdfplumber
import re

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    text = ""
    footer_text_1 = "*This is a system generated email. Please do not reply to this email. For further enquiry, kindly contact our customer service through https://tngd.my/careline-webform or call us at"
    footer_text_2 = "+603 5022 3888. The operating hours are Monday to Sunday, 7.00am to 10.00pm (including public holidays). Thank you."
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # Remove the repeated footer text
            if footer_text_1 in page_text:
                page_text = page_text.split(footer_text_1)[0]
            if footer_text_2 in page_text:
                page_text = page_text.split(footer_text_2)[0]
            text += page_text + "\n"
    return text

# Step 2: Extract and parse the lines
def parse_lines(lines):
    transaction_types = ["DuitNow QR TNGD", "DuitNow QR", "Reload", "eWallet Cash Out", "DUITNOW_RECEI", "Transfer to Wallet","Payment"]
    
    for i in range(8, min(48, len(lines))):  # Process lines 9 to 48 (40 lines)
        line = lines[i]
        
        # Split the line to extract the date and status
        parts = line.split(maxsplit=2)
        if len(parts) < 3:
            continue
        date = parts[0]
        status = parts[1]
        remaining_line = parts[2]
        
        # Identify and remove the transaction type from the line
        transaction_type = next((tt for tt in transaction_types if tt in remaining_line), "Unknown")
        remaining_line = remaining_line.replace(transaction_type, "").strip()
        
        # Handle specific cases for "Reload" and "DUITNOW_RECEI"
        if transaction_type == "Reload":
            remaining_line = remaining_line.replace("Quick Reload Payment (via GO+", "Quick Reload Payment (via GO+Balance")
            if i + 1 < len(lines) and "Balance)" in lines[i + 1]:
                lines[i + 1] = lines[i + 1].replace("Balance)", "").strip()
        elif transaction_type == "DUITNOW_RECEI":
            transaction_type = "DUITNOW_RECEIVEFROM"
            remaining_line = remaining_line.replace("DUITNOW_RECEI", "DUITNOW_RECEIVEFROM")
            if i + 1 < len(lines) and "VEFROM" in lines[i + 1]:
                lines[i + 1] = lines[i + 1].replace("VEFROM", "").strip()
        
        # Print the remaining line for debugging
        print(f"Remaining line after removing transaction type '{transaction_type}': {remaining_line}")
        
        # Initialize the reference with the first part
        reference = remaining_line.split()[0]
        
        # Keep reading lines until we reach the next date
        j = i + 1
        while j < len(lines) and not re.match(r'\d{1,2}/\d{1,2}/\d{4}', lines[j]):
            reference += " " + lines[j].strip()
            j += 1
        
        # Split the remaining part of the line to extract the description, details, amount, and wallet balance
        parts = remaining_line.split()
        wallet_balance = parts[-1]
        amount = parts[-2]
        details = parts[-3]
        description = " ".join(parts[1:-3])
        
        # Print the parsed parts
        print(f"Date: {date}")
        print(f"Status: {status}")
        print(f"Transaction Type: {transaction_type}")
        print(f"Reference: {reference}")
        print(f"Description: {description}")
        print(f"Details: {details}")
        print(f"Amount (RM): {amount}")
        print(f"Wallet Balance: {wallet_balance}")

# Main function to execute the steps
def main(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    lines = text.split('\n')
    
    # Print the first 40 lines for debugging
    for i in range(min(40, len(lines))):
        print(f"Line {i+1}: {lines[i]}")
    
    # Parse the lines
    parse_lines(lines)

# Example usage
pdf_path = r'Your tng excel'
main(pdf_path)