In [84]:
"""
This code sample shows Prebuilt Layout operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from shapely.geometry import Polygon
import os
import PyPDF2

In [85]:

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""

# Service connection settings; both are placeholders to be filled in.
endpoint = ""  # Endpoint
key = ""  # Key

# sample document
# formUrl = "https://drive.google.com/uc?export=download&id=1V8TWvnVma5kWOggGi7XFEeRRMDgss9lw"

# Client shared by every cell that calls the analysis service.
document_analysis_client = DocumentAnalysisClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

pdf_path = ''  # PDF File path

# Open the PDF just long enough to count its pages.
with open(pdf_path, "rb") as pdf_stream:
    reader = PyPDF2.PdfReader(pdf_stream)
    num_pages = len(reader.pages)
print(f"Total number of pages: {num_pages}")

# Derive the output location from the PDF path: same directory, same stem,
# ".txt" extension.
dir_name, pdf_file_name = os.path.split(pdf_path)
base_name = os.path.splitext(pdf_file_name)[0]
output_file_name = f"{base_name}.txt"

# Accumulated text of the whole document (a string, filled in by a later cell).
document_content = ''


Total number of pages: 15


In [86]:
# Process_page Function
def process_page(page, pdf_path):
    """Analyze a two-page slice of a PDF with the "prebuilt-layout" model.

    Args:
        page: Zero-based index of the first page of the slice. The request
            covers the 1-based pages ``page + 1`` through ``page + 2``.
        pdf_path: Path of the PDF file to upload.

    Returns:
        The analysis result returned by the poller.

    NOTE(review): for the last slice of a document with an odd page count
    the upper bound lies past the final page. The run recorded in this
    notebook (15 pages, slice "15-16") completed, so the service appears to
    tolerate it -- confirm before relying on it.
    """
    page_range = f"{page+1}-{page+2}"
    with open(pdf_path, "rb") as pdf_stream:
        # The result is awaited while the file is still open, as before.
        return document_analysis_client.begin_analyze_document(
            "prebuilt-layout",
            document=pdf_stream,
            locale="zh-Hant",
            pages=page_range,
        ).result()

# 暫時可用 還差一點

In [87]:
def _format_table_rows(table):
    """Render one analyzed table as pipe-delimited text, one line per row."""
    grid = [["" for _ in range(table.column_count)] for _ in range(table.row_count)]
    for cell in table.cells:
        grid[cell.row_index][cell.column_index] = cell.content
    return "".join("| " + " | ".join(row) + " |\n" for row in grid)


def process_document(result):
    """Render an analyzed layout result as plain text, page by page.

    For each page, in this order, the output contains:

    1. every text line whose polygon does not intersect a table region on
       that page,
    2. every paragraph on that page whose role is "sectionHeading",
    3. every table that has a bounding region on that page, as
       pipe-delimited rows,
    4. a "Page number: ..." footer when a paragraph with role "pageNumber"
       was found on that page.

    Each table is written exactly once, on the first page (in the order of
    ``result.pages``) where it has a bounding region. Previously a table was
    written once per bounding region, so a table continuing onto the next
    page appeared twice in full.

    Args:
        result: Analysis result exposing ``paragraphs``, ``tables`` and
            ``pages``.

    Returns:
        The concatenated text of all pages.
    """
    # Index section headings and printed page numbers by page.
    headings_by_page = {}
    page_numbers = {}
    for paragraph in result.paragraphs or []:
        if not paragraph.bounding_regions:
            # Without a region the paragraph cannot be assigned to a page.
            continue
        page_number = paragraph.bounding_regions[0].page_number
        if paragraph.role == "sectionHeading":
            headings_by_page.setdefault(page_number, []).append(paragraph)
        elif paragraph.role == "pageNumber":
            page_numbers[page_number] = paragraph.content.strip()

    tables = result.tables or []
    emitted_tables = set()  # indexes into `tables` that were already written
    parts = []

    for page in result.pages:
        # (table index, region) for every table region located on this page.
        regions_on_page = [
            (table_index, region)
            for table_index, table in enumerate(tables)
            for region in (table.bounding_regions or [])
            if region.page_number == page.page_number
        ]

        # Keep only the lines that lie outside every table region. The table
        # polygons are needed solely for this test, so they are not built
        # for a page without lines.
        lines = page.lines or []
        if lines:
            table_polygons = [
                Polygon([(p.x, p.y) for p in region.polygon])
                for _, region in regions_on_page
            ]
            for line in lines:
                line_polygon = Polygon([(p.x, p.y) for p in line.polygon])
                if not any(tp.intersects(line_polygon) for tp in table_polygons):
                    parts.append(line.content + "\n")

        # Section headings of this page.
        for paragraph in headings_by_page.get(page.page_number, []):
            parts.append(f"\n{paragraph.content.strip()}\n")

        # Tables touching this page, each written only once overall.
        for table_index, _ in regions_on_page:
            if table_index in emitted_tables:
                continue
            emitted_tables.add(table_index)
            parts.append(_format_table_rows(tables[table_index]))

        # Footer with the printed page number, when one was detected.
        if page.page_number in page_numbers:
            parts.append(f"\nPage number: {page_numbers[page.page_number]}\n")

    return "".join(parts)


# 持續修改

In [76]:
from shapely.geometry import Polygon

def process_document(result):
    """Render an analyzed layout result as plain text in reading order.

    Paragraphs and tables are ordered by the page number and the ``y``
    coordinate of the first polygon point of their first bounding region,
    then written page by page:

    * a "sectionHeading" paragraph is written and remembered; the remembered
      heading is written once more directly before the next table on the
      same page,
    * a table is written as pipe-delimited rows,
    * any other element with a ``content`` attribute is written verbatim,
    * a "Page number: ..." footer closes the page when a paragraph with role
      "pageNumber" is located on it.

    Each element is written only on the page of its first bounding region.
    Previously the page number of an element was computed but never
    compared with the current page, so every page repeated the paragraphs
    of all pages.

    Args:
        result: Analysis result exposing ``paragraphs``, ``tables`` and
            ``pages``.

    Returns:
        The concatenated text of all pages.
    """
    paragraphs = result.paragraphs or []
    tables = result.tables or []

    def position(element):
        first_region = element.bounding_regions[0]
        return (first_region.page_number, first_region.polygon[0].y)

    # Sort once for the whole result; elements without a region cannot be
    # placed on a page and are left out.
    located = [e for e in list(paragraphs) + list(tables) if e.bounding_regions]
    located.sort(key=position)

    # Bucket the ordered elements by the page of their first region.
    elements_by_page = {}
    for element in located:
        page_number = element.bounding_regions[0].page_number
        elements_by_page.setdefault(page_number, []).append(element)

    # First "pageNumber" paragraph (in result order) found on each page.
    page_number_text = {}
    for paragraph in paragraphs:
        if paragraph.role == "pageNumber" and paragraph.bounding_regions:
            page_number_text.setdefault(
                paragraph.bounding_regions[0].page_number,
                paragraph.content.strip(),
            )

    parts = []
    for page in result.pages:
        pending_heading = None  # heading waiting to be repeated before a table

        for element in elements_by_page.get(page.page_number, []):
            if getattr(element, "role", None) == "sectionHeading":
                pending_heading = element.content.strip()
                parts.append(f"{pending_heading}\n")

            elif hasattr(element, "cells"):  # tables are the elements with cells
                if pending_heading:
                    parts.append(f"\n{pending_heading}\n")
                    pending_heading = None  # a heading is repeated only once

                grid = [
                    ["" for _ in range(element.column_count)]
                    for _ in range(element.row_count)
                ]
                for cell in element.cells:
                    grid[cell.row_index][cell.column_index] = cell.content
                parts.extend("| " + " | ".join(row) + " |\n" for row in grid)

            elif hasattr(element, "content"):  # every remaining paragraph
                parts.append(element.content + "\n")

        if page.page_number in page_number_text:
            parts.append(f"\nPage number: {page_number_text[page.page_number]}\n")

    return "".join(parts)


In [88]:
# Analyze the PDF two pages at a time and render each slice to text.
# The text is rebuilt from scratch on every run, so re-running this cell
# does not append a second copy of the document to an earlier result.
slice_texts = []
for page in range(0, num_pages, 2):
    print(f"Processing page {page+1}~{page+2}...")
    result = process_page(page, pdf_path)
    slice_texts.append(process_document(result))
document_content = "".join(slice_texts)

# Write everything to one file next to the PDF. os.path.join picks the
# platform separator and yields a plain file name when dir_name is empty.
output_path = os.path.join(dir_name, output_file_name)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(document_content)
print(f"內容已成功寫入 {output_path}檔案中。")

Processing page 1~2...
Processing page 3~4...
Processing page 5~6...
Processing page 7~8...
Processing page 9~10...
Processing page 11~12...
Processing page 13~14...
Processing page 15~16...
內容已成功寫入 C:\Users\11208045\Downloads\20240816_規格對照資料\Icisive_CT\Incisive 仿單.txt檔案中。


# 檢查用程式碼

In [65]:
# Inspection run: analyze only page 11 of the PDF and keep the outcome in
# `result` for the cells below.
with open(pdf_path, "rb") as pdf_stream:
    poller = document_analysis_client.begin_analyze_document(
        "prebuilt-layout",
        document=pdf_stream,
        locale="zh-Hant",
        pages="11",
    )
    # Wait for the analysis while the file is still open.
    result = poller.result()

In [None]:
# Inspection cell: renders the module-level `result` to text with the same
# steps as the first `process_document` definition, but at module level so
# that every intermediate value stays available for inspection.

# Accumulates the rendered text of all pages.
part_content = ""
# Collect section headings and printed page numbers, keyed by page number.
paragraphs_by_page = {}
page_numbers = {}
for paragraph in result.paragraphs:
    # A paragraph is assigned to the page of its first bounding region.
    page_number = paragraph.bounding_regions[0].page_number
    if paragraph.role == "sectionHeading":
        if page_number not in paragraphs_by_page:
            paragraphs_by_page[page_number] = []
        paragraphs_by_page[page_number].append(paragraph)
    elif paragraph.role == "pageNumber":
        page_numbers[page_number] = paragraph.content.strip()

# Walk through every page.
for page in result.pages:
    page_content = ""
    table_polygons = []

    # Record the area of every table region located on this page.
    for table in result.tables:
        for region in table.bounding_regions:
            if region.page_number == page.page_number:
                polygon = Polygon([(p.x, p.y) for p in region.polygon])
                table_polygons.append(polygon)

    # Flag whether the page has text outside tables.
    # NOTE: this flag is assigned but never read in this cell.
    text_content_exists = False
    for line in page.lines:
        line_polygon = Polygon([(p.x, p.y) for p in line.polygon])
        # A line counts as table text when it intersects any table region.
        in_table = any(table_polygon.intersects(line_polygon) for table_polygon in table_polygons)
        if not in_table:
            page_content += line.content + "\n"
            text_content_exists = True

    # Check whether this page has section headings.
    if page.page_number in paragraphs_by_page:
        for paragraph in paragraphs_by_page[page.page_number]:
            # Insert the heading on its own line.
            page_content += f"\n{paragraph.content.strip()}\n"

    # Render the tables of this page, whether or not other text exists.
    # The whole table is rendered once per region it has on this page.
    for table in result.tables:
        for region in table.bounding_regions:
            if region.page_number == page.page_number:
                # Grid of cell texts; positions without a cell stay empty.
                table_content = [['' for _ in range(table.column_count)] for _ in range(table.row_count)]
                for cell in table.cells:
                    table_content[cell.row_index][cell.column_index] = cell.content

                # One pipe-delimited line per table row.
                formatted_table = ""
                for row in table_content:
                    formatted_table += "| " + " | ".join(row) + " |\n"

                # Insert the rendered table.
                page_content += formatted_table

    # Append the printed page number after the page content, if there is one.
    if page.page_number in page_numbers:
        page_content += f"\nPage number: {page_numbers[page.page_number]}\n"

    # Add this page to the overall text.
    part_content += page_content

# Original output

In [None]:
# Dump the raw analysis held in `result`: text styles, the lines of every
# page, then every table with its cells.

# One report line per detected text style.
for style in result.styles:
    print(
        "Document contains {} content".format(
            "handwritten" if style.is_handwritten else "no handwritten"
        )
    )

# Text lines, page by page.
for page in result.pages:
    for line_idx, line in enumerate(page.lines):
        print(
            "...Line # {} has text content '{}'".format(line_idx, line.content)
        )

    # Selection marks are not printed; re-enable if needed:
    # for selection_mark in page.selection_marks:
    #     print(
    #         "...Selection mark is '{}' and has a confidence of {}".format(
    #             selection_mark.state,
    #             selection_mark.confidence
    #         )
    #     )

# Tables belong to the result as a whole, not to a single page, so they are
# listed after the page loop. Nested inside it, every table was printed once
# per page.
for table_idx, table in enumerate(result.tables):
    print(
        "Table # {} has {} rows and {} columns".format(
            table_idx, table.row_count, table.column_count
        )
    )

    for cell in table.cells:
        print(
            "...Cell[{}][{}] has content '{}'".format(
                cell.row_index,
                cell.column_index,
                cell.content,
            )
        )