In [7]:
pip install pdf2image

Defaulting to user installation because normal site-packages is not writeable
Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import re
import shutil
import tempfile

import cv2
import pytesseract
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from pdf2image import convert_from_path, pdfinfo_from_path

# FastAPI application instance; the upload route below is registered on it.
app = FastAPI()

# Fallback crop region used when a request supplies no coordinates: four (x, y)
# points whose bounding box is the image area handed to OCR.
DEFAULT_COORDINATES = [(1730, 1466), (2380, 1466), (2380, 1670), (1730, 1670)]
# Fallback regex used when a request supplies no pattern: a word-bounded token
# made of 'k' or 'K' followed by one or more digits.
DEFAULT_PATTERN = r'\b[kK]\d+\b'


def convert_pdf_page_to_image(pdf_path, extracted_images_folder, page_number, dpi=150):
    """Render one PDF page and save it as ``page_<n>.jpg``.

    Args:
        pdf_path: Path of the PDF passed to ``convert_from_path``.
        extracted_images_folder: Directory the JPEG is written into.
        page_number: Page to render; used as both first and last page.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        ``(image_path, None)`` on success, or ``(None, message)`` when no image
        was produced or any exception was raised along the way.
    """
    try:
        rendered_pages = convert_from_path(
            pdf_path,
            dpi=dpi,
            first_page=page_number,
            last_page=page_number,
        )
        # Guard clause: nothing came back for this page.
        if not rendered_pages:
            return None, "No images found for page."

        target_path = os.path.join(extracted_images_folder, f"page_{page_number}.jpg")
        rendered_pages[0].save(target_path, 'JPEG')
    except Exception as exc:
        return None, f"Error converting PDF page to image: {exc}"

    return target_path, None


def extract_and_match_text(image_path, coordinates, pattern):
    """OCR the rectangle spanned by ``coordinates`` and return regex matches.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        coordinates: Iterable of ``(x, y)`` pixel points; their axis-aligned
            bounding box is the region that gets OCR'd.
        pattern: Regular expression applied to the OCR text via ``re.findall``.

    Returns:
        ``(matches, None)`` when at least one match is found, otherwise
        ``(None, message)`` explaining why nothing was produced. Exceptions
        are caught and reported through ``message`` instead of being raised.
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            return None, "Failed to load image."

        points = list(coordinates)
        if not points:
            return None, "No coordinates provided."

        img_height, img_width = image.shape[:2]
        xs = [x for x, _ in points]
        ys = [y for _, y in points]

        # Clamp the bounding box to the image. Negative values would otherwise
        # be treated as "count from the end" by slicing and select the wrong
        # (usually empty) region.
        x_min, x_max = max(min(xs), 0), min(max(xs), img_width)
        y_min, y_max = max(min(ys), 0), min(max(ys), img_height)
        if x_min >= x_max or y_min >= y_max:
            # Do not hand a zero-area crop to the OCR engine.
            return None, "Coordinates do not cover any part of the image."

        cropped_image = image[y_min:y_max, x_min:x_max]

        extracted_text = pytesseract.image_to_string(cropped_image).strip()
        matches = re.findall(pattern, extracted_text)

        if matches:
            return matches, None
        return None, "No matches found."

    except Exception as e:
        return None, f"Error: {e}"


def _parse_coordinates(raw):
    """Parse ``"x1,y1;x2,y2;..."`` into a list of ``(x, y)`` integer tuples.

    Raises:
        ValueError: If a chunk is not exactly two comma-separated integers.
    """
    points = []
    for chunk in raw.split(';'):
        parts = chunk.split(',')
        if len(parts) != 2:
            raise ValueError(f"expected 'x,y' but got {chunk!r}")
        points.append((int(parts[0]), int(parts[1])))
    return points


@app.post("/upload_pdf/")
async def upload_pdf(
        file: UploadFile = File(...),
        extracted_images_folder: str = Form(...),
        pattern_images_folder: str = Form(...),
        coordinates: str = Form(None),  # Optional: "x1,y1;x2,y2;x3,y3;x4,y4"
        pattern: str = Form(None)
):
    """Render every page of an uploaded PDF, OCR a region, and report matches.

    Each page is saved as ``page_<n>.jpg`` in ``extracted_images_folder``.
    Pages whose OCR text matches ``pattern`` are also copied into
    ``pattern_images_folder``.

    Args:
        file: The uploaded PDF.
        extracted_images_folder: Directory that receives every rendered page.
        pattern_images_folder: Directory that receives the matching pages.
        coordinates: Optional ``"x1,y1;x2,y2;..."`` string; falls back to
            ``DEFAULT_COORDINATES`` when empty.
        pattern: Optional regular expression; falls back to
            ``DEFAULT_PATTERN`` when empty.

    Returns:
        A ``JSONResponse`` with ``total_images``, ``passed_images``,
        ``failed_images_count`` and ``failed_images``. Malformed
        ``coordinates`` or ``pattern`` yield status 400 with an ``error`` key;
        any other failure yields status 500 with an ``error`` key.
    """
    # NOTE(review): both folder arguments are client-supplied paths that are
    # created on the server as-is; consider restricting them to a fixed base
    # directory.
    # NOTE(review): the rendering/OCR calls below are blocking and run inside
    # an ``async def`` handler.
    pdf_path = None
    try:
        # Validate the cheap inputs before touching the disk.
        if coordinates:
            try:
                coordinates = _parse_coordinates(coordinates)
            except ValueError as e:
                return JSONResponse(content={"error": f"Invalid coordinates: {e}"}, status_code=400)
        else:
            coordinates = DEFAULT_COORDINATES

        if not pattern:
            pattern = DEFAULT_PATTERN
        try:
            re.compile(pattern)
        except re.error as e:
            return JSONResponse(content={"error": f"Invalid pattern: {e}"}, status_code=400)

        # Write the upload to a server-chosen temporary name; the client's
        # filename is untrusted and must not be used to build a path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            pdf_path = buffer.name
            shutil.copyfileobj(file.file, buffer)

        os.makedirs(extracted_images_folder, exist_ok=True)
        os.makedirs(pattern_images_folder, exist_ok=True)

        total_images = 0
        passed_images = 0
        failed_images = []

        # Read the page count from the PDF metadata instead of rasterising the
        # whole document just to call len() on the result.
        total_pages = int(pdfinfo_from_path(pdf_path)["Pages"])

        for page_number in range(1, total_pages + 1):
            image_path, error_message = convert_pdf_page_to_image(pdf_path, extracted_images_folder, page_number)
            if not image_path:
                failed_images.append(f"page_{page_number}.jpg")
                continue

            total_images += 1
            matches, error_message = extract_and_match_text(image_path, coordinates, pattern)
            if matches is None:
                failed_images.append(f"page_{page_number}.jpg")
                continue

            passed_images += 1
            pattern_image_path = os.path.join(pattern_images_folder, f"page_{page_number}.jpg")
            # Byte-for-byte copy: avoids decoding and re-encoding the JPEG.
            # Skipped when both folders resolve to the same file.
            if os.path.abspath(image_path) != os.path.abspath(pattern_image_path):
                shutil.copyfile(image_path, pattern_image_path)

        report = {
            "total_images": total_images,
            "passed_images": passed_images,
            "failed_images_count": len(failed_images),
            "failed_images": failed_images
        }

        return JSONResponse(content=report)

    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
    finally:
        # Always remove the temporary upload, whatever the outcome.
        if pdf_path and os.path.exists(pdf_path):
            os.remove(pdf_path)


In [5]:
import cv2
import pytesseract
from PIL import Image
import re
import os


# Function to extract text from specified coordinates
def extract_text_from_coordinates(image_path, coordinates):
    """OCR the axis-aligned bounding box of ``coordinates`` within an image.

    Args:
        image_path: Path of the image file, loaded with ``cv2.imread``.
        coordinates: Iterable of ``(x, y)`` pixel points. Every point must lie
            inside the image; the bounding box of the points is cropped.

    Returns:
        ``(text, None)`` with the stripped OCR text on success, or
        ``(None, message)`` when the image cannot be loaded, the coordinates
        are unusable, or an exception is raised during extraction.
    """
    try:
        image = cv2.imread(image_path)
        if image is None:
            return None, "Failed to load image."

        # Materialise once so one-shot iterables survive the repeated passes.
        points = list(coordinates)
        if not points:
            return None, "No coordinates provided."

        img_height, img_width = image.shape[:2]
        if any(x < 0 or x >= img_width or y < 0 or y >= img_height for x, y in points):
            return None, "Coordinates are out of bounds for the image."

        xs = [x for x, _ in points]
        ys = [y for _, y in points]
        x_min, x_max = min(xs), max(xs)
        y_min, y_max = min(ys), max(ys)
        if x_min == x_max or y_min == y_max:
            # A zero-area crop cannot be OCR'd; report it explicitly.
            return None, "Coordinates describe an empty region."
        cropped_image = image[y_min:y_max, x_min:x_max]

        # cv2.imread yields BGR channel order while PIL interprets arrays as
        # RGB, so swap the channels before building the PIL image.
        rgb_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_image)
        extracted_text = pytesseract.image_to_string(pil_image)

        return extracted_text.strip(), None
    except Exception as e:
        return None, f"Error during text extraction: {e}"


# Function to process a single image file
def process_image_file(image_path):
    """OCR a fixed region of one image and print every ``K<digits>`` code found.

    The region and the regex are fixed inside the function. Results are
    reported on stdout only; nothing is returned.
    """
    region = [(1730, 1466), (2380, 1466), (2380, 1670), (1730, 1670)]
    code_regex = r'K\d+'

    text, problem = extract_text_from_coordinates(image_path, region)
    file_name = os.path.basename(image_path)

    # Nothing usable came back: report either the error or the empty result.
    if not text:
        if problem:
            print(f"Error in {file_name}: {problem}")
        else:
            print(f"No matches found in {file_name}")
        return

    found = re.findall(code_regex, text)
    print(found)
    for hit in found:
        print(hit)
        print(f"Match in {file_name}: {hit}")
    print(f"Total matches found: {len(found)}")

# Path of the page image to run the OCR check on.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own image before running this cell elsewhere.
image_path = '/home/harish/Documents/schucoPdf/schuco_images/page_377.jpg'

# Run the single-image check when this code is the entry point.
if __name__ == "__main__":
    process_image_file(image_path)


['K18112']
K18112
Match in page_377.jpg: K18112
Total matches found: 1
