In [2]:
import cv2
import numpy as np
import fitz  # PyMuPDF
import pytesseract
import os

In [3]:
def convert_pdf_to_images(pdf_path, output_folder):
    """Render every page of a PDF to a PNG file and return the saved paths.

    Args:
        pdf_path: Path of the PDF document to open.
        output_folder: Directory that receives the PNG files; it is created
            when it does not exist yet.

    Returns:
        A list of image paths, one per page in page order, each named
        ``Processed_<pdf base name>_page_<1-based page number>.png``.
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]  # File name without extension
    image_paths = []

    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            # View the raw sample bytes as a (height, width, channels) array.
            img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

            # get_pixmap() renders in RGB (plus alpha when n == 4), whereas
            # cv2.imwrite treats the array as BGR. Convert explicitly so the
            # red and blue channels are not swapped in the saved file.
            if pix.n == 4:
                img_data = cv2.cvtColor(img_data, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:
                img_data = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)

            image_path = os.path.join(output_folder, f"Processed_{base_name}_page_{page_num + 1}.png")
            cv2.imwrite(image_path, img_data)
            image_paths.append(image_path)
    finally:
        # Release the document handle even when rendering or saving fails.
        pdf_document.close()
    return image_paths

def preprocess_image(image_path):
    """Load an image and clean it up for OCR.

    The pipeline is: grayscale -> 5x5 Gaussian blur -> non-local-means
    denoising -> adaptive Gaussian threshold -> 3x3 morphological closing ->
    contrast scaling.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The processed single-channel image produced by the last pipeline step.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of deep inside cvtColor.
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image file: {image_path}")

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)  # Reduce high-frequency noise
    denoised_image = cv2.fastNlMeansDenoising(blurred_image, h=30)
    # Block size 11 and constant 2 control the local threshold neighbourhood.
    adaptive_thresh_image = cv2.adaptiveThreshold(
        denoised_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    kernel = np.ones((3, 3), np.uint8)  # Structuring element for the closing
    morph_image = cv2.morphologyEx(adaptive_thresh_image, cv2.MORPH_CLOSE, kernel)
    # NOTE(review): the input is already thresholded to 0/255, so scaling by
    # 1.5 presumably saturates back to the same values - confirm before removing.
    contrast_image = cv2.convertScaleAbs(morph_image, alpha=1.5, beta=0)
    return contrast_image

def extract_text_from_images(image_paths):
    """Run Tesseract OCR on each image file and collect the recognised text.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A list with one OCR result string per input path, in input order.

    Raises:
        FileNotFoundError: If one of the paths does not exist.
        ValueError: If a file exists but OpenCV cannot decode it.
    """
    texts = []
    for image_path in image_paths:
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread returns None on failure instead of raising.
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image file not found: {image_path}")
            raise ValueError(f"Could not decode image file: {image_path}")

        # OpenCV loads images as BGR while pytesseract assumes RGB input.
        rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # --psm 1: automatic page segmentation with orientation/script detection.
        texts.append(pytesseract.image_to_string(rgb_img, config='--psm 1'))
    return texts

def extract_text_from_file(file_path, output_folder="Processed_images_PDFs"):
    """OCR a PDF or an image file and return the recognised text.

    A path ending in ``.pdf`` (case-insensitive) is rendered page by page via
    ``convert_pdf_to_images`` and each page image is OCR'd without
    preprocessing. Any other path is treated as an image: it is cleaned with
    ``preprocess_image``, the cleaned image is saved under ``output_folder``
    as ``Processed_<base name>.png``, and the cleaned image is OCR'd.

    Args:
        file_path: Path of the PDF or image to process.
        output_folder: Directory for the generated PNG files.

    Returns:
        The extracted text; per-page results are joined with newlines.
    """
    is_pdf = file_path.lower().endswith('.pdf')
    if is_pdf:
        page_images = convert_pdf_to_images(file_path, output_folder)
        return "\n".join(extract_text_from_images(page_images))

    # Image input: preprocess first, then persist the cleaned copy and OCR it.
    cleaned = preprocess_image(file_path)
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    target_path = os.path.join(output_folder, f"Processed_{stem}.png")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    cv2.imwrite(target_path, cleaned)
    ocr_result = pytesseract.image_to_string(cleaned, config='--psm 1')
    return "\n".join([ocr_result])

In [2]:
# Prerequisite: install pdfplumber into the kernel's environment, e.g. `%pip install pdfplumber`

In [3]:
# test pdf
import pdfplumber

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. A page that yields no
        text contributes an empty string, so a single-page document returns
        exactly that page's text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against extract_text() returning None for a page
        # without text; joining with "\n" keeps the last line of one page
        # from fusing with the first line of the next.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)




In [5]:
# Extract and display the text layer of a sample PDF invoice.
# Raw string: the Windows-style backslash must stay literal; in a normal
# string "\I" is an invalid escape sequence and triggers a warning.
receipt_text = extract_text_from_pdf(r"uploaded-receipts\Invoice-12-19-2020-328814-1_WCCIJbS.pdf")
print(receipt_text)

Invoice
CAN US EXPERT TEK SOLUTIONS
Contact Number :+8558342055
1300 W Holly St STE 206
Bill Date : 12/19/2020
Bellingham
Period : 11/19/2020 - 12/18/2020
-98225
Invoice No : C328814-2
US
Account Summary
Current Activity
Current Bill Amount(USD)
Fixed Charges - Line2 - Business Monthly (USD) 24.99
Usage Charges(USD) 0.00
Current Bill Total(USD) 24.99
Net Charges 24.99
Other Charges(USD)
Other Charges Total(USD) 0.00
Total Taxes, Charges & Fees(USD) 7.32
Current Amount(USD) 32.31
Payment(s) Received(USD) 32.31
Total Amount Due(USD) 0.00
Taxes, Charges, & Fees
FCC Regulatory Fee (Federal) 0.02
FUSF (Federal) 1.90
Sales Tax (State) 1.93
Sales Tax (Local) 0.66
Utility Users Tax (Local) 0.81
Compliance & Administrative Cost Recovery Fee 2.00
Total Taxes, Charges, & Fees(USD) 7.32
Billing Summary for Company - CAN US EXPERT TEK SOLUTIONS
Description of Charge
Fixed Charges
Company 24.99
Other Recurring Charges
Total Charges (USD): 24.99
Page 1 of 1
v1.0


In [4]:
# Demo: run the OCR pipeline on one sample receipt image and print the result.
if __name__ == "__main__":
    # Replace with your own file path (PDF or image). The raw string keeps the
    # Windows-style backslash literal.
    file_path = r"uploaded-receipts\IMG_1305_9yP7EuA.jpg"
    # A ".pdf" path is rendered page by page and OCR'd directly; any other
    # path is preprocessed as an image before OCR.
    extracted_text = extract_text_from_file(file_path)
    print(extracted_text)
    
    

TASTE of ON RECORD ------
PUNJAB RESTAURANT
SURREY BC

Purchase

Jul 14,2022 19:40:20
VISA Khe KEKKHEKEK YD 7 0
Entry: Tap EMV (H)

Ref#: 418-1D21 95708209106

Auth#: 006227 Response: 01-827
Order: MGO1657852810066

Username: harj8477
Amount - § 25.20

A0000000031010 Visa Credit
TVR 0000000000

Approved
Signature Not Required

important:Retain this Copy for your
record


