# Azure AI Document Intelligence の動作確認

In [None]:
from typing_extensions import Any

In [None]:
import sys

# Add ../src to the module search path. Later cells import from the `common`
# package, which presumably lives under that directory (verify against the
# repository layout). The path is relative to the current working directory.
sys.path.append("../src")

In [None]:
from common.load_config import get_input_dir, get_output_dir

# Resolve the input and output directories through the project's config helpers.
input_dir = get_input_dir()
output_dir = get_output_dir()

# Document to analyze. The `/` join assumes get_input_dir() returns a
# pathlib-style path object -- TODO confirm in common.load_config.
path_input_file = input_dir / "2_15.pdf"

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the environment.
load_dotenv()
# Key and endpoint used to build the Document Intelligence client below.
# os.getenv returns None for an unset variable, so either value may be None.
AZURE_AI_SERVICES_API_KEY = os.getenv("AZURE_AI_SERVICES_API_KEY")
AZURE_AI_SERVICES_ENDPOINT = os.getenv("AZURE_AI_SERVICES_ENDPOINT")

In [None]:
from azure.core.credentials import AzureKeyCredential
# NOTE(review): AnalyzeDocumentRequest is imported but not used in the visible cells.
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, AnalyzeResult

# Client authenticated with the API key read from the environment.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_AI_SERVICES_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SERVICES_API_KEY),
)

# Submit the file's raw bytes to the "prebuilt-layout" model and request the
# extracted content in markdown form.
with open(path_input_file, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        model_id="prebuilt-layout",
        body=f,
        output_content_format="markdown",
    )

# Fetch the final analysis result from the poller (presumably blocks until the
# service-side operation has finished -- confirm against the SDK docs).
result: AnalyzeResult = poller.result()

In [None]:
# Show the runtime type of the object returned by poller.result().
print(type(result))

## contents

In [None]:
# Print the extracted document content (markdown was requested in the analyze call).
print(result.content)

In [None]:
from common.file_utils import str_to_md_file

# Re-resolve the output directory (it was already assigned in an earlier cell)
# and hand the extracted content to the project helper together with the
# target path "2_15.md". The helper presumably writes the string to that
# file -- see common.file_utils.
output_dir = get_output_dir()
path_output_file = output_dir / "2_15.md"
str_to_md_file(result.content, path_output_file)

## sections

In [None]:
# Dump the raw `sections` attribute of the analysis result for inspection.
print(result.sections)

In [None]:
def get_sections(result: AnalyzeResult) -> list[list[str]]:
    """Collect the element references of every section in an analysis result.

    :param result: Analysis result whose ``sections`` attribute is read.
    :return: One entry per section, each holding that section's ``elements``
        (an empty list when a section has none). An empty list is returned
        when the result has no sections at all.
    """
    # Fall back to empty lists so a missing (None) ``sections`` or ``elements``
    # value does not break iteration or the declared return type.
    return [section.elements or [] for section in result.sections or []]

In [None]:
# Evaluate get_sections on the analysis result; as the cell's last expression
# its return value is what the notebook displays.
get_sections(result)

## paragraphs

In [None]:
# Dump the raw `paragraphs` attribute of the analysis result for inspection.
print(result.paragraphs)

In [None]:
def get_paragraphs(result: AnalyzeResult) -> list[dict[str, Any]]:
    """Flatten the paragraphs of an analysis result into plain dictionaries.

    :param result: Analysis result whose ``paragraphs`` attribute is read.
    :return: One dictionary per paragraph with the keys ``id``
        (``"/paragraphs/<index>"``), ``content``, ``role`` (both ``""`` when
        unset), and ``polygon`` / ``pageNumber`` taken from the paragraph's
        first bounding region (both ``None`` when the paragraph has no
        bounding region). An empty list is returned when the result has no
        paragraphs.
    """
    paragraphs = []
    for idx, paragraph in enumerate(result.paragraphs or []):
        # Only the first bounding region is reported; guard against a
        # paragraph that carries no regions at all instead of indexing None.
        regions = paragraph.get("boundingRegions") or []
        first_region = regions[0] if regions else {}
        paragraphs.append(
            {
                "id": f"/paragraphs/{idx}",
                "content": paragraph.content or "",
                "role": paragraph.role or "",
                "polygon": first_region.get("polygon"),
                "pageNumber": first_region.get("pageNumber"),
            }
        )
    return paragraphs

In [None]:
# Evaluate get_paragraphs on the analysis result; as the cell's last
# expression its return value is what the notebook displays.
get_paragraphs(result)

## tables

In [None]:
# Dump the raw `tables` attribute of the analysis result for inspection.
print(result.tables)

In [None]:
def get_tables(result: AnalyzeResult) -> list[dict[str, Any]]:
    """Convert every table of an analysis result into plain dictionaries.

    :param result: Analysis result whose ``tables`` attribute is read.
    :return: One dictionary per table with the keys ``row_count``,
        ``column_count`` and ``cells``; each cell is a dictionary with
        ``row_index``, ``column_index`` and ``content``. An empty list is
        returned when the result has no tables.
    """
    tables = []
    for table in result.tables or []:
        cells = [
            {
                "row_index": cell.row_index,
                "column_index": cell.column_index,
                "content": cell.content,
            }
            for cell in table.cells
        ]
        tables.append(
            {
                "row_count": table.row_count,
                "column_count": table.column_count,
                "cells": cells,
            }
        )
    # Return after the loop so that all tables are included, not just the first.
    return tables

In [None]:
# Evaluate get_tables on the analysis result; as the cell's last expression
# its return value is what the notebook displays.
get_tables(result)

## figures

In [None]:
# Dump the raw `figures` attribute of the analysis result for inspection.
print(result.figures)

In [None]:
def print_figures(result: AnalyzeResult) -> None:
    """Print caption, element and bounding-region details of every figure.

    For each figure the caption text, the caption's elements and bounding
    regions, the figure's elements and the figure's bounding regions are
    printed. Figure bounding regions whose polygon equals one of the
    caption's polygons are skipped so the caption area is not listed twice.

    :param result: Analysis result whose ``figures`` attribute is read.
    :return: None; all output goes to stdout.
    """
    if result.figures is None:
        print("figureが存在しません")
        # Stop here: there is nothing to iterate over.
        return

    for idx, figure in enumerate(result.figures):
        print(f"--------Analysis of Figures #{idx + 1}--------")

        # Polygons that belong to this figure's caption. Reset for every
        # figure so a figure without a caption neither fails nor inherits
        # the polygons of the previous figure.
        caption_polygons = []

        if figure.caption:
            title = figure.caption.get("content")
            if title:
                print(f"Caption: {title}")

            elements = figure.caption.get("elements")
            if elements:
                print("...caption elements involved:")
                for item in elements:
                    print(f"......Item #{item}")

            caption_regions = figure.caption.get("boundingRegions")
            if caption_regions:
                print("...caption bounding regions involved:")
                for item in caption_regions:
                    print(f"......Item pageNumber: {item.get('pageNumber')}")
                    print(f"......Item polygon: {item.get('polygon')}")
                    caption_polygons.append(item.get("polygon"))

        if figure.elements:
            print("Elements involved:")
            for item in figure.elements:
                print(f"...Item #{item}")

        regions = figure.get("boundingRegions")
        if regions:
            print("Bounding regions involved:")
            for item in regions:
                # Exclude regions that are really the caption's polygon.
                if item.get("polygon") not in caption_polygons:
                    print(f"......Item pageNumber: {item.get('pageNumber')}")
                    print(f"......Item polygon: {item.get('polygon')}")


In [None]:
# Print the figure details of the analysis result to the cell output.
print_figures(result)

## 図の切り出しと保存

In [None]:
from PIL import Image
import fitz  # PyMuPDF
import mimetypes
from mimetypes import guess_type
def crop_image_from_image(image_path, page_number, bounding_box):
    """
    Cut a rectangular region out of an image file.

    :param image_path: Path to the image file.
    :param page_number: Frame to select before cropping; only used when the
        opened image reports the "TIFF" format.
    :param bounding_box: A tuple of (left, upper, right, lower) coordinates
        passed unchanged to the crop call.
    :return: The cropped region.
    :rtype: PIL.Image.Image
    """
    with Image.open(image_path) as source:
        frame = source
        if source.format == "TIFF":
            # Move to the requested frame and work on a copy of it.
            source.seek(page_number)
            frame = source.copy()

        # bounding_box is expected as (left, upper, right, lower).
        return frame.crop(bounding_box)

def crop_image_from_pdf_page(pdf_path, page_number, bounding_box):
    """
    Crops a region from a given page in a PDF and returns it as an image.

    :param pdf_path: Path to the PDF file.
    :param page_number: The page number to crop from (0-indexed).
    :param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the
        bounding box. Each value is multiplied by 72 before it is used as a
        page rectangle in points (1/72 inch), i.e. the values are expected
        in inches.
    :return: A PIL Image of the cropped area.
    """
    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_number)

        # Convert the box to points (1/72 inch), the unit the clip rectangle
        # is given in.
        rect = fitz.Rect([coord * 72 for coord in bounding_box])
        # Render only the clipped area, scaled by 300/72 on both axes.
        pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), clip=rect)
        # Build the image before the document is closed.
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Close the document even when loading or rendering the page fails.
        doc.close()

def crop_image_from_file(file_path, page_number, bounding_box):
    """
    Crop a region from a PDF or an image file.

    The file type is guessed from the file name: files guessed as
    "application/pdf" go to crop_image_from_pdf_page, everything else goes
    to crop_image_from_image.

    Args:
        file_path (str): The path to the file.
        page_number (int): The page number (for PDF and TIFF files, 0-indexed).
        bounding_box (tuple): The bounding box coordinates in the format (x0, y0, x1, y1).

    Returns:
        A PIL Image of the cropped area.
    """
    detected_type, _encoding = mimetypes.guess_type(file_path)
    cropper = (
        crop_image_from_pdf_page
        if detected_type == "application/pdf"
        else crop_image_from_image
    )
    return cropper(file_path, page_number, bounding_box)
