In [5]:
import easyocr
import fitz  # PyMuPDF
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
from groq import Groq
import os
from dotenv import load_dotenv

In [6]:
directories = [
    "../outputs/extracted_images",
    "../outputs/extracted_images_ocr",
    "../outputs/extracted_products_ocr"
]

# Create the directories if they do not exist
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

Directory already exists: ../outputs/extracted_images
Directory already exists: ../outputs/extracted_images_ocr
Directory already exists: ../outputs/extracted_products_ocr


In [19]:

def split_image_with_overlap_simple(image, window_size=(500, 500), overlap=250):
    height, width = image.shape[:2]
    window_w, window_h = window_size
    step_x = window_w - overlap
    step_y = window_h - overlap
   
    # Calculate remainders to see how much padding is needed
    remainder_x = width % step_x
    remainder_y = height % step_y

    child_images = []
    # Loop through the image with the specified step size
    for y in range(0, height - window_h + 1, step_y):
        for x in range(0, width - window_w + 1, step_x):
            # Extract the window and add it to the list
            child_image = image[y:y + window_h, x:x + window_w]
            child_images.append(child_image)  
    return child_images


In [None]:
def split_image_with_overlap_simple_padded_image(image, window_size=(500, 500), overlap=250):
    height, width = image.shape[:2]
    window_w, window_h = window_size
    step_x = window_w - overlap
    step_y = window_h - overlap

    # Calculate remainders to see how much padding is needed
    remainder_x = width % step_x
    remainder_y = height % step_y

    # Calculate padding amounts
    pad_right = (step_x - remainder_x) if remainder_x > 0 else 0
    pad_bottom = (step_y - remainder_y) if remainder_y > 0 else 0

    # Apply padding
    padded_image = cv2.copyMakeBorder(image, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=[0, 0, 0])
    padded_height, padded_width = padded_image.shape[:2]

    child_images = []
    # Loop through the padded image with the specified step size
    for y in range(0, padded_height - window_h + 1, step_y):
        for x in range(0, padded_width - window_w + 1, step_x):
            # Extract the window and add it to the list
            child_image = padded_image[y:y + window_h, x:x + window_w]
            child_images.append(child_image)
            
    return child_images

In [20]:
def is_mostly_white_or_black(image, threshold=0.95, color='white'):
    if color == 'white':
        # Set target pixel value for white (255) and tolerance
        target_value = 255
        tolerance = 10  # Adjust as needed for brightness variations
    elif color == 'black':
        # Set target pixel value for black (0) and tolerance
        target_value = 0
        tolerance = 10  # Adjust as needed for darkness variations
    else:
        raise ValueError("Color should be 'white' or 'black'")

    # Convert to grayscale if it's a color image
    if len(image.shape) == 3:
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray_image = image

    # Create a binary mask for pixels within the target value ± tolerance
    if color == 'white':
        mask = cv2.inRange(gray_image, target_value - tolerance, target_value)
    else:
        mask = cv2.inRange(gray_image, target_value, target_value + tolerance)

    # Calculate the percentage of target pixels
    target_pixel_ratio = np.sum(mask > 0) / mask.size

    return target_pixel_ratio >= threshold

In [21]:

# Load your big image
for page_num in range(1, 26):
    input_image= directories[0] + f"/page_{page_num}.png"
    big_image = cv2.imread(input_image)  # Load your large image
    child_images = split_image_with_overlap_simple(big_image, window_size=(400, 400), overlap=100)
    child_iamge_path = directories[1] + f"/page_{page_num}"
    if not os.path.exists(child_iamge_path):
        os.makedirs(child_iamge_path)

    for idx, child_image in enumerate(child_images):
        is_white = is_mostly_white_or_black(child_image, threshold=0.95, color='white')
        is_black = is_mostly_white_or_black(child_image, threshold=0.95, color='black')
        if is_white or is_black:
            continue
        cv2.imwrite(f"{child_iamge_path}/{idx}.png", child_image)

In [16]:
import cv2
import numpy as np

def calculate_overlap(box1, box2):
    """Calculate the intersection-over-union (IoU) between two boxes."""
    x1, y1, x2, y2 = box1
    x3, y3, x4, y4 = box2

    # Compute the intersection area
    inter_x1 = max(x1, x3)
    inter_y1 = max(y1, y3)
    inter_x2 = min(x2, x4)
    inter_y2 = min(y2, y4)

    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)

    # Compute areas of the individual boxes
    area_box1 = (x2 - x1) * (y2 - y1)
    area_box2 = (x4 - x3) * (y4 - y3)

    # Calculate overlap ratio of box2 within box1
    overlap_ratio = inter_area / area_box2 if area_box2 > 0 else 0
    return overlap_ratio

def get_union_box(window_box, boxes, max_size=1000):
    """Get the union of the window box and all overlapping boxes with size limit during computation."""
    x1, y1, x2, y2 = window_box

    for box in boxes:
        bx1, by1, bx2, by2 = box

        # Calculate potential new boundaries if we include this box
        new_x1, new_y1 = min(x1, bx1), min(y1, by1)
        new_x2, new_y2 = max(x2, bx2), max(y2, by2)

        # Check if the new boundaries would exceed max_size
        if (new_x2 - new_x1 <= max_size) and (new_y2 - new_y1 <= max_size):
            # Update the union bounds only if within max_size constraint
            x1, y1, x2, y2 = new_x1, new_y1, new_x2, new_y2

    return (x1, y1, x2, y2)

def split_image_with_overlap(image, boxes, window_size=(500, 500), overlap=250, max_size=1000):
    """Split image with overlapping windows and return regions with text coverage."""
    height, width = image.shape[:2]
    window_w, window_h = window_size
    step_x = window_w - overlap
    step_y = window_h - overlap

    # Calculate remainders to see how much padding is needed
    remainder_x = width % step_x
    remainder_y = height % step_y
    
    # Calculate padding amounts
    pad_right = (step_x - remainder_x) if remainder_x > 0 else 0
    pad_bottom = (step_y - remainder_y) if remainder_y > 0 else 0

    # Apply padding
    padded_image = cv2.copyMakeBorder(image, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=[0, 0, 0])
    height, width = padded_image.shape[:2]  # Update dimensions after padding

    # Store the extracted regions
    extracted_regions = []

    # Loop through the image with the specified step size
    for y in range(0, height - window_h + 1, step_y):
        for x in range(0, width - window_w + 1, step_x):
            # Define the current window box
            window_box = (x, y, x + window_w, y + window_h)

            # Check which boxes have >=60% overlap with this window
            overlapping_boxes = [box for box in boxes if calculate_overlap(window_box, box) >= 0.6]

            if overlapping_boxes:
                # Calculate the union of the window and overlapping boxes
                union_box = get_union_box(window_box, overlapping_boxes, max_size)

                # Extract the region from the padded image using the union box
                ux1, uy1, ux2, uy2 = union_box
                extracted_region = padded_image[uy1:uy2, ux1:ux2]

                # Add the extracted region and coordinates to the result list
                extracted_regions.append({
                    "image": extracted_region,
                    "coordinates": union_box
                })
            else:
                
                ux1, uy1, ux2, uy2 = window_box
                extracted_region = padded_image[uy1:uy2, ux1:ux2]
                extracted_regions.append({
                    "image": extracted_region,
                    "coordinates": extracted_region
                })


    return extracted_regions


In [17]:
import cv2
import easyocr
import numpy as np
from sklearn.cluster import DBSCAN

def load_image(image_path):
    """Load the image for OCR processing."""
    image = cv2.imread(image_path)
    return image

def expand_box(box, padding, H, W):
    """Expand the bounding box by a padding amount."""
    (start_x, start_y, end_x, end_y) = box
    start_x = max(0, start_x - padding)
    start_y = max(0, start_y - padding)
    end_x = min(W, end_x + padding)
    end_y = min(H, end_y + padding)
    return (start_x, start_y, end_x, end_y)

def detect_and_extract_paragraphs(image_path, padding=10, paragraph=True, x_ths = 1.0, y_ths = 0.5, detail=1):
    """Detect and extract paragraphs using EasyOCR with paragraph mode."""
    reader = easyocr.Reader(['en'])
    image = load_image(image_path)
    H, W = image.shape[:2]

    # Run OCR detection with paragraph mode
    ocr_results = reader.readtext(image, paragraph=paragraph, x_ths=x_ths, y_ths=y_ths, detail=detail)

    # Extracted text results with expanded boxes
    extracted_text = []
    boxes = []  # List to store bounding boxes for clustering

    for idx, result in enumerate(ocr_results):
        # result[0] contains the bounding box (paragraph mode returns an enclosing box)
        # result[1] contains the paragraph text
        # result[2] is the confidence score
                # Check if result contains 3 elements or just 2
        if len(result) == 3:
            bbox, text, prob = result
        else:
            bbox, text = result
            prob = None  # Set confidence to None if not provided

        # Convert bounding box to rectangular coordinates
        start_x = int(min(point[0] for point in bbox))
        start_y = int(min(point[1] for point in bbox))
        end_x = int(max(point[0] for point in bbox))
        end_y = int(max(point[1] for point in bbox))
        
        # Append the bounding box to the list
        boxes.append((start_x, start_y, end_x, end_y))

        # Append text results (without merging yet)
        extracted_text.append({
            "text": text,
            "box": (start_x, start_y, end_x, end_y)
        })

    return extracted_text, boxes
    

def visualize_results(image_path, extracted_text, image_name):
    """Visualize extracted text regions on the image."""
    image = load_image(image_path)

    for result in extracted_text:
        (start_x, start_y, end_x, end_y) = result['box']
        cv2.rectangle(image, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)
    cv2.imwrite(f'{directories[1]}/{image_name}', image)


In [18]:

# Load your big image
for page_num in range(1, 26):
    process_image=f"page_{page_num}"
    image_path = directories[0] + f"/{process_image}.png"
    extracted_text, ocr_boxes = detect_and_extract_paragraphs(image_path, padding=0, paragraph=True, x_ths=.05, y_ths=0.3, detail=1) 
    visualize_results(image_path=image_path, extracted_text=extracted_text, image_name=f"{process_image}_paragraph.png")

    input_image= directories[0] + f"/page_{page_num}.png"
    big_image = cv2.imread(input_image)  # Load your large image
    child_images = split_image_with_overlap(big_image, ocr_boxes, window_size=(400, 400), overlap=100, max_size=700)
    child_iamge_path = directories[1] + f"/page_{page_num}"
    if not os.path.exists(child_iamge_path):
        os.makedirs(child_iamge_path)

    for idx, extracted_image_info in enumerate(child_images):
        child_image = extracted_image_info["image"]
        is_white = is_mostly_white_or_black(child_image, threshold=0.95, color='white')
        is_black = is_mostly_white_or_black(child_image, threshold=0.95, color='black')
        if is_white or is_black:
            continue
        cv2.imwrite(f"{child_iamge_path}/{idx}.png", child_image)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi