In [None]:
import os

ending = "/papers/2502.01061"
base_url = "https://arxiv.org/pdf/"
if "papers" in ending:
    ending = ending.replace("/papers/", "")
url = os.path.join(base_url, ending)




: 

In [7]:
import fitz
doc = fitz.open("first_paper.pdf")
for i, page in enumerate(doc):
    pix = page.get_pixmap(dpi=150)  # or higher DPI
    pix.save(f"page_{i}.png")


In [10]:
import cv2
import numpy as np

def find_empty_areas(image_path, 
                     threshold_value=180, 
                     morph_kernel_size=(5,5), 
                     min_area=5000):
    """
    Find areas on a page image that do NOT contain text (or are mostly white).
    
    Parameters
    ----------
    image_path : str
        Path to the input page image (ideally a grayscale PNG/JPEG).
    threshold_value : int
        Threshold for binarization. Adjust if your document is lighter/darker.
    morph_kernel_size : (int, int)
        Size of the structuring element for morphological operations.
        Larger kernel can unify text lines and small text blocks more strongly.
    min_area : int
        Minimum area (in pixels) for a region to be considered a valid "empty" area.
        Adjust based on image resolution and how large you want the empty area to be.
    
    Returns
    -------
    empty_bboxes : list of tuples
        List of bounding boxes (x, y, w, h) for each region that appears free of text.
    mask_empty : np.ndarray (2D, uint8)
        A binary mask (same size as input image) where empty areas are white (255)
        and text areas are black (0).
    """
    # 1) Load image in grayscale
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(f"Could not load image: {image_path}")
    
    # 2) Binarize (invert so text is white on black if you prefer)
    #    For this example, let's keep text as black on white
    _, bin_img = cv2.threshold(img, threshold_value, 255, cv2.THRESH_BINARY)
    
    # 3) Morphological operations to expand/unify text regions
    #    If your text is dark on a light background, you want to unify black text.
    #    So we invert the image to make text white for dilation:
    bin_inverted = 255 - bin_img
    
    # Increase "white" (text) areas so that scattered text is grouped together
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_kernel_size)
    dilated = cv2.dilate(bin_inverted, kernel, iterations=1)
    
    # 4) We'll consider the "text areas" as everything that is white in `dilated`.
    #    The complement of that are the "empty" areas.
    #    Create a mask for text regions
    text_mask = dilated
    
    # Invert again to get empty areas in white
    # Now: text_mask=255 means text, so empty_mask=255 means empty
    empty_mask = 255 - text_mask
    
    # 5) Find contours in the empty_mask to locate blocks of empty space
    contours, _ = cv2.findContours(empty_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    empty_bboxes = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        area = w * h
        
        # Filter out small or trivial spaces
        if area > min_area:
            empty_bboxes.append((x, y, w, h))
    
    return empty_bboxes, empty_mask

# Example usage:
if __name__ == "__main__":
    bboxes, mask_empty = find_empty_areas("page_3.png",
                                          threshold_value=180,
                                          morph_kernel_size=(10,10),
                                          min_area=10000)
    
    print("Found empty bounding boxes:", bboxes)
    # Save the mask for visualization
    cv2.imwrite("page_0_empty_mask.png", mask_empty)


Found empty bounding boxes: [(0, 0, 1241, 1754)]


In [11]:
import cv2
import numpy as np

def find_large_black_regions(image_path,
                             threshold_value=128,
                             morph_kernel_size=(3, 3),
                             coverage_threshold=0.5,
                             min_area=2000):
    """
    Detect large black regions in a binary image that are likely images/figures,
    excluding smaller or sparse text blocks.

    Parameters
    ----------
    image_path : str
        Path to the input document page image (preferably grayscale).
    threshold_value : int
        Threshold value for binarization (0-255). Adjust as needed.
    morph_kernel_size : (int, int)
        Kernel size for optional morphological operations to unify large blocks.
    coverage_threshold : float
        Minimum coverage ratio to consider a region "large black region."
        E.g., 0.5 means at least 50% of the bounding box must be black.
    min_area : int
        Minimum area in pixels of the bounding box to be considered. 
        Excludes tiny specks (and presumably text lines if very small).

    Returns
    -------
    image_bboxes : list of (x, y, w, h)
        Bounding boxes of the detected image-like regions.
    mask_images : np.ndarray
        A binary mask (same size as the input) where the large black regions are white.
    """

    # 1) Load the image in grayscale
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        raise FileNotFoundError(f"Could not load image: {image_path}")
    
    # 2) Binarize (ensure black text/figures = 0, white background = 255)
    #    Depending on the document, you might invert. But let's assume black on white.
    _, bin_img = cv2.threshold(gray, threshold_value, 255, cv2.THRESH_BINARY)
    
    # OPTIONAL: Morphological closing to unify dark regions
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, morph_kernel_size)
    closed = cv2.morphologyEx(bin_img, cv2.MORPH_CLOSE, kernel, iterations=1)
    
    # 3) Find contours of dark regions (which are 0 in a 0-255 image).
    #    We can invert so contours find white shapes, or we specify RETR_EXTERNAL for black shapes.
    #    Often easier to invert so the shapes become white:
    inverted = 255 - closed  
    contours, _ = cv2.findContours(inverted, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    image_bboxes = []
    mask_images = np.zeros_like(bin_img, dtype=np.uint8)  # will mark found regions as white
    
    # 4) Analyze each contour
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        box_area = w * h
        if box_area < min_area:
            continue  # ignore very small objects
        
        # 5) Coverage ratio: how many black pixels are in this contour’s bounding box?
        #    Let's create a mask for this contour alone:
        contour_mask = np.zeros_like(bin_img, dtype=np.uint8)
        cv2.drawContours(contour_mask, [cnt], 0, color=255, thickness=-1)
        
        #    Count how many black/white pixels are in that bounding box region:
        #    In 'inverted', the region is white=255, so we can count it directly in 'contour_mask'
        #    or we can just count within the bounding box.
        contour_pixels = cv2.countNonZero(contour_mask[y:y+h, x:x+w])
        
        # coverage ratio = contour_pixels / bounding_box_area
        coverage = contour_pixels / float(box_area)
        
        # 6) If coverage is large enough, treat as an image block
        if coverage >= coverage_threshold:
            # Mark region in the mask
            cv2.drawContours(mask_images, [cnt], 0, color=255, thickness=-1)
            image_bboxes.append((x, y, w, h))

    return image_bboxes, mask_images


if __name__ == "__main__":
    # Example usage
    path = "page_3.png"
    bboxes, mask_img = find_large_black_regions(
        image_path=path,
        threshold_value=128,
        morph_kernel_size=(3, 3),
        coverage_threshold=0.5,  # 50% coverage
        min_area=2000
    )
    
    print("Detected bounding boxes for large black regions:")
    for bb in bboxes:
        print("  (x,y,w,h)=", bb)
    
    # Save the mask for visualization
    cv2.imwrite("large_black_regions_mask.png", mask_img)


Detected bounding boxes for large black regions:
  (x,y,w,h)= (327, 419, 30, 198)
  (x,y,w,h)= (612, 406, 51, 85)
  (x,y,w,h)= (383, 406, 53, 94)


In [12]:
import cv2

def extract_images_from_bboxes(image_path, bboxes, output_prefix="crop"):
    """
    Extract and save image crops from an input image using provided bounding boxes.

    Parameters
    ----------
    image_path : str
        Path to the source image (e.g., a page in PNG/JPG).
    bboxes : list of tuple
        List of (x, y, w, h) bounding boxes. Coordinates are in pixel space:
            x,y -> top-left corner
            w -> width
            h -> height
    output_prefix : str
        Prefix for the saved cropped images.

    Returns
    -------
    crops : list of np.ndarray
        A list of the cropped image regions (as arrays).
    """

    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not load image: {image_path}")
    
    crops = []

    # For each bounding box, crop the region and save
    for i, (x, y, w, h) in enumerate(bboxes):
        # Ensure coordinates are within image bounds
        # (in case the bbox extends off the image edge)
        x_end = min(x + w, image.shape[1])
        y_end = min(y + h, image.shape[0])

        # Crop
        crop_img = image[y:y_end, x:x_end]
        crops.append(crop_img)

        # Save to disk
        out_name = f"{output_prefix}_{i}.png"
        cv2.imwrite(out_name, crop_img)
        print(f"Saved crop {i} to {out_name}")

    return crops

# Example usage:
if __name__ == "__main__":
    # Suppose you have bounding boxes from some previous analysis
    found_bboxes = [
        (327, 419, 30, 198),  # (x, y, w, h)
        (500, 100, 200, 200)
    ]
    
    # Call the function
    cropped_images = extract_images_from_bboxes("page_0.png", found_bboxes, output_prefix="figure")


Saved crop 0 to figure_0.png
Saved crop 1 to figure_1.png


In [13]:
!pip install layoutparser "layoutparser[ocr]" detectron2==0.6 opencv-python


Collecting layoutparser
  Downloading layoutparser-0.3.4-py3-none-any.whl.metadata (7.7 kB)
[31mERROR: Could not find a version that satisfies the requirement detectron2==0.6 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for detectron2==0.6[0m[31m
[0m

In [14]:
import cv2
import layoutparser as lp

def extract_figures_with_layoutparser(image_path, 
                                      model_config="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config", 
                                      score_threshold=0.5):
    """
    Detect and extract figures from a document page image using LayoutParser's 
    Detectron2 model (trained on PubLayNet).

    Parameters
    ----------
    image_path : str
        Path to the page image (PNG/JPG).
    model_config : str
        Model config from the LayoutParser model zoo.
    score_threshold : float
        Confidence threshold for detected layouts.

    Returns
    -------
    figure_bboxes : list of tuple
        A list of bounding boxes (x1, y1, x2, y2) for detected figures.
    figure_crops : list of np.ndarray
        Corresponding cropped image regions for each figure.
    """

    # 1. Read the page image
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not load image file: {image_path}")

    # 2. Initialize LayoutParser’s Detectron2 model
    #    The label_map for PubLayNet usually is:
    #    0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"
    model = lp.Detectron2LayoutModel(
        config_path=model_config,
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", score_threshold],
        label_map={0:"Text", 1:"Title", 2:"List", 3:"Table", 4:"Figure"},
        device="cpu"  # or "cuda" if you have a GPU
    )

    # 3. Run detection on the image
    layout = model.detect(image)

    # 4. Filter out only 'Figure' blocks
    figure_blocks = [b for b in layout if b.type == "Figure"]

    figure_bboxes = []
    figure_crops = []

    # 5. Crop out each detected figure
    for i, fig_block in enumerate(figure_blocks):
        # LayoutParser stores coordinates in a Rect object
        x1, y1, x2, y2 = map(int, fig_block.block.coordinates)  # [left, top, right, bottom]

        # Ensure valid bounding box within image boundaries
        x1 = max(0, x1); y1 = max(0, y1)
        x2 = min(image.shape[1], x2)
        y2 = min(image.shape[0], y2)

        crop_img = image[y1:y2, x1:x2]
        figure_bboxes.append((x1, y1, x2, y2))
        figure_crops.append(crop_img)
    
    return figure_bboxes, figure_crops


if __name__ == "__main__":
    # Example usage:
    image_file = "page_0.png"  # a page you've converted from PDF
    boxes, crops = extract_figures_with_layoutparser(image_file, score_threshold=0.5)

    print("Detected figure bounding boxes:")
    for i, (x1, y1, x2, y2) in enumerate(boxes):
        print(f"Figure {i}: (x1={x1}, y1={y1}, x2={x2}, y2={y2})")
        # Save each cropped figure
        out_path = f"figure_{i}.png"
        cv2.imwrite(out_path, crops[i])
        print(f"  -> saved figure to {out_path}")


AttributeError: module layoutparser has no attribute Detectron2LayoutModel