In [None]:
import easyocr
from pdf2image import convert_from_path
import numpy as np
import pandas as pd
import re
import os

# Step 1: Convert PDF to image
# Settings: rasterisation DPI, source statement, destination workbook.
dpi_setting = 300
pdf_path = r"C:\Users\HP\kreativetimebox-task\CUSTOM LLM\paddleOcr\Barclays_uk_bank_statement.pdf"
output_excel = os.path.expanduser(
    "~/Documents/output_table_final_cleaned_fixed.xlsx"
)

# Rasterise the PDF; only the first element (page one) is kept for OCR.
images = convert_from_path(pdf_path, dpi=dpi_setting)
first_page = images[0]
image = np.array(first_page)

# Step 2: Run EasyOCR
# English model, CPU only. Each entry of `results` is unpacked further down
# as a (bounding box, text, third value) triple.
reader = easyocr.Reader(['en'], gpu=False)
results = reader.readtext(image)

# Step 3: Group results by Y position
# Rounding each y-centre to the nearest 10 px splits a printed line whenever
# its tokens straddle a bucket edge (e.g. 104 -> 100 but 106 -> 110).
# Instead, sort the tokens top-to-bottom and start a new row only when the
# vertical gap to the previous token exceeds the tolerance.
row_tolerance_px = 10

tokens = []
for bbox, text, _ in results:
    clean_text = text.strip()
    if not clean_text:
        continue  # ignore detections that are only whitespace
    # Centre of the four-corner bounding box.
    x_center = sum(pt[0] for pt in bbox) / 4
    y_center = sum(pt[1] for pt in bbox) / 4
    tokens.append((y_center, x_center, clean_text))

tokens.sort(key=lambda tok: tok[0])

# rows maps a row's y position (y-centre of its first token) to the list of
# (x_center, text) pairs on that row; Step 4 sorts keys and row contents.
rows = {}
row_key = None
prev_y = None
for y_center, x_center, clean_text in tokens:
    if prev_y is None or y_center - prev_y > row_tolerance_px:
        row_key = y_center
    rows.setdefault(row_key, []).append((x_center, clean_text))
    prev_y = y_center

# Step 4: Sort and structure rows
# OCR can misread a leading "£" as "E", "e", "{" or "[". Rewrite that
# character only when a digit follows it, so ordinary words that start with
# "E" (e.g. "ELECTRICITY") are not turned into "£LECTRICITY".
currency_misread = re.compile(r"^[\{\[Ee](?=\d)")

structured_data = []

for y in sorted(rows):
    # Left-to-right reading order within the row.
    line = sorted(rows[y], key=lambda item: item[0])

    # Normalize symbols (E or { to £) on amount-like tokens only
    text_line = [currency_misread.sub("£", text) for _, text in line]

    structured_data.append(text_line)

# Step 5: Merge broken rows and fix column alignment
# A row that has at least two cells and starts with a dd/mm/yyyy date opens
# a new record; every other row is treated as a continuation of the record
# currently being built (rows seen before the first dated row are collected
# into their own leading record).
date_prefix = re.compile(r"\d{2}/\d{2}/\d{4}")

final_data = []
temp_row = []

for row in structured_data:
    if len(row) >= 2 and date_prefix.match(row[0]):
        if temp_row:
            final_data.append(temp_row)
        # Copy the row: extending it below must not mutate the list that
        # still lives inside structured_data.
        temp_row = list(row)
    else:
        # Append continuation lines (broken rows like RESTAURANT BILL)
        temp_row.extend(row)

if temp_row:
    final_data.append(temp_row)

# Step 6: Extract clean columns from each row
# Token classifiers, compiled once:
#   - date:    starts with dd/mm/yyyy
#   - balance: starts with "£" followed by a digit
#   - amount:  bare number, with optional thousands separators and decimals
#              ("45", "45.10", "1,234.56"); without the separator support,
#              amounts of 1,000 or more fell through into the description.
date_token = re.compile(r"\d{2}/\d{2}/\d{4}")
balance_token = re.compile(r"^£\d")
amount_token = re.compile(r"^(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?$")

cleaned_data = []

for row in final_data:
    # Initialize fields
    date = ""
    money_in = ""
    money_out = ""
    balance = ""
    description_parts = []

    for t in row:
        if date_token.match(t):
            date = t
        elif balance_token.match(t):
            # If several "£" tokens appear, the last one wins.
            balance = t
        elif amount_token.match(t):
            # Column positions are not available here, so the first bare
            # amount fills Money Out and the second Money In; any further
            # amounts in the same record are ignored.
            if not money_out:
                money_out = t
            elif not money_in:
                money_in = t
        else:
            description_parts.append(t)

    # Keep only records that have both a date and a balance.
    if date and balance:
        description = " ".join(description_parts).strip()
        cleaned_data.append([date, description, money_in, money_out, balance])

# Step 7: Export to Excel
# Make sure the destination folder exists so the write does not fail after
# the (slow) OCR pass has already completed.
output_dir = os.path.dirname(output_excel)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One spreadsheet row per cleaned record, in the order they were extracted.
df = pd.DataFrame(cleaned_data, columns=["Date", "Description", "Money In", "Money Out", "Balance"])
df.to_excel(output_excel, index=False)

print(f"✅ Clean table saved to Excel: {output_excel}")


Using CPU. Note: This module is much faster with a GPU.


✅ Clean table saved to Excel: C:\Users\HP/Documents/output_table_final_cleaned_fixed.xlsx
